Tryag File Manager
Home
-
Turbo Force
Current Path :
/
home
/
cluster1
/
data
/
bu01
/
1121861
/
html
/
jlex
/
php4
/
Upload File :
New :
File
Dir
//home/cluster1/data/bu01/1121861/html/jlex/php4/db_modifier.php4
<?php
include_once("schema_loader.php4");

/**
 * db_modifier produces a modified version of an XML document.
 *
 * Modifications include:
 * <UL>
 * <LI>cDATA symbol substitution: substituting one type of symbol for another, e.g. 'a' for 'á'.
 *     The new content is placed within a new tag with a '_s' suffix, e.g. <field>á</field>
 *     is followed by <field_s>a</field_s>.</LI>
 * <LI>Field removal: specified XML tags and their content can be excluded from the modified document.</LI>
 * <LI>Alphabetization tag: a new <alpha> tag is written for each entry, containing a value that
 *     can be used to sort query results. The value is chosen by user-supplied rules.</LI>
 * </UL>
 *
 * The class provides two separate but related parses:
 * <UL>
 * <LI>get_fields() collects the list of field types that exist within the XML document, and the
 *     list of fields whose content contains characters named in $symbols / $first_chars.</LI>
 * <LI>modify() performs the modifications and writes a new XML document.</LI>
 * </UL>
 * Both parses share the same SAX handlers; $produce_stripped_version selects the mode.
 * Perhaps in a future version this class should be split into two.<BR>
 *
 * NOTE ON ENCODING: the substitution rules are converted to ISO-8859-1 (utf8_decode) and the
 * output file is declared as ISO-8859-1. PHP 4 delivered character data to the handlers in
 * ISO-8859-1 by default, PHP 5+ delivers UTF-8 by default, so the parser's target encoding is
 * set explicitly to ISO-8859-1 to keep rules, data and output consistent on every version.
 *
 * The code deliberately avoids visibility keywords, closures and short array syntax so that it
 * still parses on PHP 4 while no longer depending on functions removed in PHP 7 (ereg, /e).
 *
 * @package JLex
 * @author Jonathan Dick
 * @date Dec. 30, 2005
 */
class db_modifier
{
    /**#@+
     * @access public
     */

    /**
     * The XML parser of the parse currently running (or last run).
     * @var resource
     */
    var $parser;

    /**
     * The file pointer for writing to disk.
     * @var resource
     */
    var $out;

    /**
     * The current cDATA.
     * @var string
     */
    var $cur_data;

    /**
     * The symbols to be substituted for. Symbols and substitutes live in separate, parallel arrays
     * so they can be handed directly to str_replace().
     * @var array of strings
     */
    var $symbols;

    /**
     * The substitute symbols, parallel to $symbols.
     * @var array of strings
     */
    var $substitutes;

    /**
     * Substitution rules which only apply to the first character: an associative array indexed by
     * the symbol to replace and containing its substitute.
     * @var array of strings
     */
    var $first_chars;

    /**
     * A bracket expression composed of the values from $symbols and $first_chars.
     *
     * It is non-empty exactly when at least one rule has been loaded, and is used as that flag.
     * The actual "does this content need substitution" test is done against the rule arrays.
     * @var string
     */
    var $pattern;

    /**
     * The set of symbols to be replaced in the alpha field.
     * @var array of strings
     */
    var $alpha_symbols;

    /**
     * The set of substitutes for the $alpha_symbols.
     * @var array of strings
     */
    var $alpha_substitutes;

    /**
     * First-character rules for the alpha field (symbol => substitute).
     * @var array of strings
     */
    var $alpha_first_chars;

    /**
     * The bracket expression representing $alpha_symbols and $alpha_first_chars.
     * It is built for callers to inspect; this class does not match against it.
     * @var string
     */
    var $alpha_pattern;

    /**
     * The fields which will be used to create the alpha column.
     *
     * The first field is used if it exists in the entry, otherwise the second, and so on.
     * The alpha rules are then applied to the value of the selected field.
     * @var array of strings
     */
    var $sort_fields;

    /**
     * True when sort fields were provided by the user.
     * @var boolean
     */
    var $has_sort_fields;

    /**
     * The current value of the alpha column.
     *
     * Because the document is parsed linearly, a field is only known to exist once it has been
     * parsed. The value of any sort field is therefore kept until a sort field of higher
     * precedence is encountered in the same entry.
     * @var string
     */
    var $alpha;

    /**
     * The index within $sort_fields of the field whose content is currently held in $alpha.
     * A sort field replaces $alpha only when its index is lower than this value.
     * @var integer
     */
    var $cur_alpha_index;

    /**
     * The (lower-cased) XML tag marking the beginning of an entry.
     * @var string
     */
    var $head_tag;

    /**
     * True when the running parse is a modification, false when it is a field search.
     * @var boolean
     */
    var $produce_stripped_version;

    /**
     * The fields whose cDATA contains characters named in $symbols / $first_chars.
     * Filled by get_fields().
     * @var array
     */
    var $strippable_fields;

    /**
     * During a field search: the growing list of fields found within entries.
     * During a modification: the set of fields to include in the modified document.
     * @var array
     */
    var $fields;

    /**
     * The fields for which a substituted "<field>_s" version should be written.
     * @var array
     */
    var $fields_to_strip;

    /**
     * True between a start $head_tag and its end tag.
     * @var boolean
     */
    var $in_entry;

    /**
     * Initialised to false by the constructor; not read anywhere in this class.
     * Declared so that the assignment does not create a dynamic property.
     * @var boolean
     */
    var $check_all_fields;

    /**#@-*/

    /**
     * Initialises every property to an empty default.
     */
    function __construct()
    {
        $this->parser = null;
        $this->out = null;
        $this->check_all_fields = false;
        $this->cur_data = "";
        $this->symbols = array();
        $this->substitutes = array();
        $this->first_chars = array();
        $this->pattern = "";
        $this->alpha_symbols = array();
        $this->alpha_substitutes = array();
        $this->alpha_first_chars = array();
        $this->alpha_pattern = "";
        $this->sort_fields = array();
        $this->has_sort_fields = false;
        $this->alpha = "";
        $this->cur_alpha_index = 10;
        $this->head_tag = "";
        $this->produce_stripped_version = false;
        $this->strippable_fields = array();
        $this->fields = array();
        $this->fields_to_strip = array();
        $this->in_entry = false;
    }

    /**
     * PHP 4 style constructor, kept so the class still initialises itself on PHP 4 and so that
     * code calling $obj->db_modifier() keeps working. Newer PHP versions use __construct().
     */
    function db_modifier()
    {
        $this->__construct();
    }

    /**
     * Splits a whitespace-separated string and appends the names to $sort_fields.
     *
     * @param string $sort_fields A space separated list of fields contained in the XML document.
     * @return void The fields are appended to the member variable $sort_fields.
     */
    function get_sort_fields($sort_fields)
    {
        // PREG_SPLIT_NO_EMPTY drops the empty names that doubled or trailing spaces would create.
        $names = preg_split('/\s+/', trim($sort_fields), -1, PREG_SPLIT_NO_EMPTY);
        foreach ($names as $name) {
            $this->sort_fields[] = $name;
        }
    }

    /**
     * Extracts fields, one per line, from a file and returns them in an array.
     *
     * This function is no longer used by the class itself.
     *
     * @param string $fields The filename containing the fields to be extracted.
     * @return array The trimmed lines of the file; an empty array if the file cannot be opened.
     */
    function get_fields_to_strip($fields)
    {
        $fields_to_strip = array();
        $in = fopen($fields, "r");
        if (!$in) {
            return $fields_to_strip;
        }
        // Compare against false explicitly: a line consisting of "0" is falsy but valid.
        while (($line = fgets($in)) !== false) {
            $fields_to_strip[] = trim($line);
        }
        fclose($in);
        return $fields_to_strip;
    }

    /**
     * Loads the symbol substitution rules for the alpha field from the content of an HTML textarea.
     *
     * Each line should hold one rule of the form "X = Y". "X = ''" deletes X, and "WXY = Z"
     * replaces each of W, X and Y by Z. Lines without " = " are ignored. The symbols are
     * appended to $alpha_symbols / $alpha_substitutes and $alpha_pattern is rebuilt.
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only member variables are affected.
     */
    function make_alpha_symbols_table($textarea)
    {
        foreach ($this->_parse_rules($textarea) as $rule) {
            $this->alpha_symbols[] = $rule[0];
            $this->alpha_substitutes[] = $rule[1];
        }
        $body = $this->_bracket_body(implode("", $this->alpha_symbols));
        if ($body !== "") {
            $this->alpha_pattern = "[" . $body . "]";
        }
    }

    /**
     * Loads the first-character substitution rules for the alpha field from an HTML textarea.
     *
     * Same rule syntax as make_alpha_symbols_table(). The rules are stored in $alpha_first_chars
     * (symbol => substitute) and "^[...]" is appended to $alpha_pattern as an alternative.
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only member variables are affected.
     */
    function make_alpha_first_chars_table($textarea)
    {
        foreach ($this->_parse_rules($textarea) as $rule) {
            $this->alpha_first_chars[$rule[0]] = $rule[1];
        }
        $body = $this->_bracket_body(trim(implode("", array_keys($this->alpha_first_chars))));
        if ($body !== "") {
            if ($this->alpha_pattern != "") {
                $this->alpha_pattern .= "|";
            }
            $this->alpha_pattern .= "^[" . $body . "]";
        }
    }

    /**
     * Loads the symbol substitution rules for ordinary fields from the content of an HTML textarea.
     *
     * Each line should hold one rule of the form "X = Y". "X = ''" deletes X, and "WXY = Z"
     * replaces each of W, X and Y by Z. Lines without " = " are ignored. The symbols are
     * appended to $symbols / $substitutes and $pattern is rebuilt.
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only member variables are affected.
     */
    function make_symbols_table($textarea)
    {
        foreach ($this->_parse_rules($textarea) as $rule) {
            $this->symbols[] = $rule[0];
            $this->substitutes[] = $rule[1];
        }
        $body = $this->_bracket_body(implode("", $this->symbols));
        if ($body !== "") {
            $this->pattern = "[" . $body . "]";
        }
    }

    /**
     * Loads the first-character substitution rules for ordinary fields from an HTML textarea.
     *
     * Same rule syntax as make_symbols_table(). The rules are stored in $first_chars
     * (symbol => substitute) and "^[...]" is appended to $pattern as an alternative.
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only member variables are affected.
     */
    function make_first_chars_table($textarea)
    {
        foreach ($this->_parse_rules($textarea) as $rule) {
            $this->first_chars[$rule[0]] = $rule[1];
        }
        $body = $this->_bracket_body(trim(implode("", array_keys($this->first_chars))));
        if ($body !== "") {
            if ($this->pattern != "") {
                $this->pattern .= "|";
            }
            $this->pattern .= "^[" . $body . "]";
        }
    }

    /**
     * Returns $value with the alpha rules ($alpha_symbols, $alpha_first_chars) applied.
     *
     * @param string $value The content to be modified.
     * @return string The modified content.
     */
    function alpha_strip($value)
    {
        return $this->_apply_rules(
            $value,
            $this->alpha_symbols,
            $this->alpha_substitutes,
            $this->alpha_first_chars
        );
    }

    /**
     * Returns $value with the field rules ($symbols, $first_chars) applied.
     *
     * @param string $value The content to be modified.
     * @return string The modified content.
     */
    function strip($value)
    {
        return $this->_apply_rules(
            $value,
            $this->symbols,
            $this->substitutes,
            $this->first_chars
        );
    }

    /**
     * Handles a start tag during XML parsing.
     *
     * Element names are lower-cased (the parser upper-cases them by default). In field-search
     * mode nothing is written. In modification mode the start tag is written when the element
     * is outside of an entry, or is inside an entry and listed in $fields.
     *
     * NOTE: attributes are not copied to the output.
     *
     * @param resource $xp The XML parser.
     * @param string $element The element name.
     * @param array $attribs The element attributes (unused).
     * @return void
     */
    function startHandler($xp, $element, $attribs)
    {
        $element = strtolower($element);
        if ($element == $this->head_tag) {
            $this->in_entry = true;
        }
        if (!$this->produce_stripped_version) {
            return;
        }
        if (!$this->in_entry) {
            fwrite($this->out, "<$element>\n");
        } elseif (in_array($element, $this->fields)) {
            fwrite($this->out, "<$element>");
        }
    }

    /**
     * Handles an end tag during XML parsing.
     *
     * Modification mode:
     * <UL>
     * <LI>If the element is a sort field of higher precedence than the one currently held,
     *     its (alpha-stripped) content becomes the new $alpha.</LI>
     * <LI>If the element is the head tag, the entry is complete: write the <alpha> element,
     *     then reset $alpha and $cur_alpha_index.</LI>
     * <LI>Write $cur_data and the close tag if the element is outside an entry or is listed
     *     in $fields.</LI>
     * </UL>
     * Field-search mode: add the element to $fields if it is inside an entry and not yet listed.<BR>
     * In both modes, if rules are loaded and the content contains a character they name:
     * in modification mode write "<element_s>" with the substituted content when the element is
     * in $fields_to_strip; in field-search mode add the element to $strippable_fields.
     *
     * @param resource $xp The XML parser.
     * @param string $element The element name.
     * @return void
     */
    function endHandler($xp, $element)
    {
        $element = strtolower($element);
        $is_head = ($element == $this->head_tag);

        if ($this->produce_stripped_version) {
            if ($this->has_sort_fields && in_array($element, $this->sort_fields)) {
                $index = array_search($element, $this->sort_fields);
                if ($index < $this->cur_alpha_index) {
                    $this->alpha = $this->alpha_strip($this->cur_data);
                    $this->cur_alpha_index = $index;
                }
            }
            if ($is_head && $this->has_sort_fields) {
                // The alpha element is written just before the entry's closing tag.
                fwrite($this->out, "<alpha>" . $this->alpha . "</alpha>\n");
                $this->alpha = "";
                $this->cur_alpha_index = count($this->sort_fields);
            }
            if (!$this->in_entry || in_array($element, $this->fields)) {
                fwrite($this->out, $this->cur_data . "</$element>\n");
            }
        } elseif ($this->in_entry && !in_array($element, $this->fields)) {
            $this->fields[] = $element;
        }

        if ($this->pattern != "" && $this->_needs_substitution($this->cur_data)) {
            if ($this->produce_stripped_version) {
                if (in_array($element, $this->fields_to_strip)) {
                    $stripped_value = $this->strip($this->cur_data);
                    fwrite($this->out, "<" . $element . "_s>" . $stripped_value . "</" . $element . "_s>\n");
                }
            } elseif (!in_array($element, $this->strippable_fields)) {
                $this->strippable_fields[] = $element;
            }
        }

        if ($is_head) {
            $this->in_entry = false;
        }
        $this->cur_data = "";
    }

    /**
     * Handles character data during parsing: non-blank data is XML-escaped and appended to
     * $cur_data.
     *
     * The parser delivers character data with entities already decoded, so "&", "<" and ">"
     * must be escaped again before the text is written into the new document.
     *
     * NOTE: each chunk is trimmed individually, as in the original implementation. The parser
     * may deliver one text node in several chunks, in which case whitespace at chunk
     * boundaries is lost.
     *
     * @param resource $xp The XML parser.
     * @param string $data The character data chunk.
     * @return void
     */
    function cDataHandler($xp, $data)
    {
        $data = trim($data);
        if ($data !== "") {
            // "&" must be escaped first so the ampersands of "&lt;" / "&gt;" are not re-escaped.
            $data = str_replace(array("&", "<", ">"), array("&amp;", "&lt;", "&gt;"), $data);
            $this->cur_data .= $data;
        }
    }

    /**
     * Removes HTML encodings of special characters: named entities are translated through the
     * flipped HTML_ENTITIES table and decimal numeric entities ("&#65;") through chr().
     *
     * @param string $string The text containing HTML entities.
     * @return string The decoded text.
     */
    function unhtmlentities($string)
    {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
        $ret = strtr($string, $trans_tbl);
        // preg_replace_callback replaces the /e modifier, which no longer exists in PHP 7+.
        return preg_replace_callback('/&#([0-9]+);/', array($this, '_decode_numeric_entity'), $ret);
    }

    /**
     * Parses an XML document and collects the set of all fields used within entries ($fields)
     * and the set of all fields whose content contains characters named by the rules
     * ($strippable_fields). No new XML document is produced.
     *
     * @param string $xml The filename of the XML document to be parsed.
     * @param string $head_tag The tag marking the beginning of an entry.
     * @param string $symbols The textarea containing the rules for symbol substitution.
     * @param string $first_chars The textarea containing the rules for first character substitution.
     * @return boolean True if the document was parsed completely, false if it could not be
     *                 opened or is not well-formed (a warning is raised in that case).
     */
    function get_fields($xml, $head_tag, $symbols, $first_chars)
    {
        // Element names are lower-cased by the handlers, so the head tag must be too.
        $this->head_tag = strtolower($head_tag);
        if (trim($symbols) != "") {
            $this->make_symbols_table($symbols);
        }
        if (trim($first_chars) != "") {
            $this->make_first_chars_table($first_chars);
        }
        $this->produce_stripped_version = false;

        $ok = $this->_parse_file($xml);

        sort($this->strippable_fields);
        sort($this->fields);
        return $ok;
    }

    /**
     * Produces a modified version of an XML document based on the supplied conditions.
     *
     * First $produce_stripped_version is set so the handlers know which mode is running, then
     * all user conditions are loaded into the member variables, and finally the document is
     * parsed and the modified document written.
     *
     * @param string $xml The name of the XML file to be modified.
     * @param string $new_xml_name The name of the new, modified XML file.
     * @param string $head_tag The name of the tag enclosing an entry.
     * @param string $sort_fields A space separated list of fields used to create the alpha field.
     * @param string $alpha_symbols The textarea containing the substitution rules for the alpha field.
     * @param string $alpha_first_chars The textarea containing the first-character rules for the alpha field.
     * @param string $symbols The textarea containing the substitution rules for field values.
     * @param string $first_chars The textarea containing the first-character rules for field values.
     * @param array $fields The fields to include in the modified document.
     * @param array $fields_to_strip The fields for which a substituted "_s" version is written.
     * @return boolean True on success, false if a file could not be opened or the XML is not
     *                 well-formed (a warning is raised in that case).
     */
    function modify($xml, $new_xml_name, $head_tag, $sort_fields, $alpha_symbols, $alpha_first_chars, $symbols, $first_chars, $fields, $fields_to_strip)
    {
        $this->produce_stripped_version = true;
        $this->fields = $fields;
        $this->fields_to_strip = $fields_to_strip;
        // Element names are lower-cased by the handlers, so the head tag must be too.
        $this->head_tag = strtolower($head_tag);

        if (trim($sort_fields) != "") {
            $this->get_sort_fields($sort_fields);
            $this->has_sort_fields = true;
            $this->fields[] = "alpha";
            // Start above every valid index so that the first sort field found in the first
            // entry is always accepted, however many sort fields there are.
            $this->cur_alpha_index = count($this->sort_fields);
        } else {
            $this->has_sort_fields = false;
        }

        if (trim($symbols) != "") {
            $this->make_symbols_table($symbols);
        }
        if (trim($first_chars) != "") {
            $this->make_first_chars_table($first_chars);
        }
        if (trim($alpha_symbols) != "") {
            $this->make_alpha_symbols_table($alpha_symbols);
        }
        if (trim($alpha_first_chars) != "") {
            $this->make_alpha_first_chars_table($alpha_first_chars);
        }

        $this->out = fopen($new_xml_name, "w");
        if (!$this->out) {
            trigger_error("db_modifier: cannot open '$new_xml_name' for writing", E_USER_WARNING);
            return false;
        }
        fwrite($this->out, "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n");

        $ok = $this->_parse_file($xml);

        fclose($this->out);
        return $ok;
    }

    /**#@+
     * @access private
     */

    /**
     * Parses the content of a rules textarea into a list of (symbol, substitute) pairs.
     *
     * @param string $textarea Rules, one per line, in the form "X = Y", "X = ''" or "WXY = Z".
     * @return array List of array(symbol, substitute), one element per single-byte symbol.
     */
    function _parse_rules($textarea)
    {
        $rules = array();
        // stripslashes() undoes magic-quote escaping of form input; utf8_decode() converts the
        // form's UTF-8 to the single-byte ISO-8859-1 used for the parsed data.
        $textarea = trim(utf8_decode(stripslashes($textarea)));
        foreach (explode("\n", $textarea) as $line) {
            $vals = explode(" = ", $line);
            if (count($vals) < 2) {
                // Blank or malformed line: without this check every character on the line
                // (including a stray "\r") would become a symbol to delete.
                continue;
            }
            $substitute = trim($vals[1]);
            if ($substitute == "''") {
                $substitute = "";
            }
            // Split the left-hand side into its individual bytes.
            $symbols = preg_split('//', $vals[0], -1, PREG_SPLIT_NO_EMPTY);
            foreach ($symbols as $symbol) {
                $rules[] = array($symbol, $substitute);
            }
        }
        return $rules;
    }

    /**
     * Prepares a string of symbols for use inside "[...]": a trailing apostrophe is moved to the
     * front, exactly as the original pattern construction did.
     *
     * @param string $chars The concatenated symbols.
     * @return string The reordered symbols.
     */
    function _bracket_body($chars)
    {
        if (substr($chars, -1, 1) == "'") {
            $chars = "'" . substr($chars, 0, -1);
        }
        return (string) $chars;
    }

    /**
     * Applies a set of substitution rules to a value.
     *
     * All symbols are replaced everywhere first; then, as long as the first character has a
     * first-character rule, it is replaced by its substitute.
     *
     * @param string $value The content to be modified.
     * @param array $symbols The symbols to replace.
     * @param array $substitutes The substitutes, parallel to $symbols.
     * @param array $first_chars First-character rules (symbol => substitute).
     * @return string The modified content.
     */
    function _apply_rules($value, $symbols, $substitutes, $first_chars)
    {
        $stripped = (string) str_replace($symbols, $substitutes, $value);
        if (!is_array($first_chars)) {
            return $stripped;
        }
        while ($stripped !== "") {
            $first_char = substr($stripped, 0, 1);
            if (!array_key_exists($first_char, $first_chars)) {
                break;
            }
            $substitute = (string) $first_chars[$first_char];
            // The cast covers PHP versions where substr() returns false past the end.
            $stripped = $substitute . (string) substr($stripped, 1);
            if ($substitute !== "" && substr($substitute, 0, 1) === $first_char) {
                // A rule such as "a = a" would otherwise match its own output forever.
                break;
            }
        }
        return $stripped;
    }

    /**
     * Tells whether $value contains any character of $symbols, or starts with a character that
     * has a rule in $first_chars. This is the condition $pattern describes, evaluated without
     * the ereg extension (removed in PHP 7).
     *
     * @param string $value The content to test.
     * @return boolean True if at least one rule applies to $value.
     */
    function _needs_substitution($value)
    {
        $value = (string) $value;
        if ($value === "") {
            return false;
        }
        foreach ($this->symbols as $symbol) {
            $symbol = (string) $symbol;
            if ($symbol !== "" && strpos($value, $symbol) !== false) {
                return true;
            }
        }
        if (!is_array($this->first_chars)) {
            return false;
        }
        return array_key_exists(substr($value, 0, 1), $this->first_chars);
    }

    /**
     * preg_replace_callback() helper for unhtmlentities(): converts one decimal numeric entity.
     *
     * @param array $matches Match data; $matches[1] holds the decimal code.
     * @return string The character with that code, as produced by chr().
     */
    function _decode_numeric_entity($matches)
    {
        return chr((int) $matches[1]);
    }

    /**
     * Creates a parser wired to this object's handlers and feeds it the file line by line.
     *
     * @param string $xml The filename of the XML document to be parsed.
     * @return boolean True if the whole document was parsed without error.
     */
    function _parse_file($xml)
    {
        $in = fopen($xml, "r");
        if (!$in) {
            trigger_error("db_modifier: cannot open XML file '$xml'", E_USER_WARNING);
            return false;
        }

        $this->parser = xml_parser_create();
        // Handlers must receive ISO-8859-1: the rules are utf8_decode()d and the output is
        // declared as ISO-8859-1. PHP 5+ would otherwise deliver UTF-8.
        xml_parser_set_option($this->parser, XML_OPTION_TARGET_ENCODING, "ISO-8859-1");
        xml_set_object($this->parser, $this);
        xml_set_element_handler($this->parser, "startHandler", "endHandler");
        xml_set_character_data_handler($this->parser, "cDataHandler");

        // Start from a clean state even if a previous parse was aborted midway.
        $this->in_entry = false;
        $this->cur_data = "";

        $ok = true;
        while (($line = fgets($in)) !== false) {
            if (!xml_parse($this->parser, $line, false)) {
                $ok = false;
                break;
            }
        }
        // Signal the end of the document so the parser flushes anything it still buffers.
        if ($ok && !xml_parse($this->parser, "", true)) {
            $ok = false;
        }
        if (!$ok) {
            trigger_error(
                sprintf(
                    "db_modifier: XML error in '%s' at line %d: %s",
                    $xml,
                    xml_get_current_line_number($this->parser),
                    xml_error_string(xml_get_error_code($this->parser))
                ),
                E_USER_WARNING
            );
        }

        fclose($in);
        xml_parser_free($this->parser);
        return $ok;
    }

    /**#@-*/
}

/*
//NOTE: you must remove utf8_decode from the make_*_table functions when testing from the command line
set_time_limit(0);
$converter = new db_modifier();
$sort_fields = "lxa lxo";
$symbols = "á = a\né = e\ní = i\nó = o\nú = u\nÁ = A\nÉ = E\nÍ = I\nÓ = O\nÚ = U\nÑ = N\nñ = n\nü = u";
$first_chars = "-' = ''";
$alpha_symbols = "á = a";
$alpha_first_chars = "-' = ''";
$fields_not_to_strip = array("compound","disamb","enc_note","fl","fla","flao","flo","grm","grmx","irregv","lex_aff","mod","nae","ncol","nde","nfe","nse","nss","pea","peo","pl_com","plo","qry","qry_fon","qry_h","root_notes","se","sea","seao","sem","seo","spko","src","subadj");
$converter->get_fields("test.xml","refgroup",$symbols,$first_chars);
foreach($converter->strippable_fields as $field) {
    echo "$field \n";
}
$fields_to_strip = array_diff($converter->strippable_fields,$fields_not_to_strip);
$converter->modify("ActiveNahuatl_2005_august_final.xml","with_stripped.xml","refgroup",
                   $sort_fields,$alpha_symbols,$alpha_first_chars,$symbols,$first_chars,
                   $converter->fields,$fields_to_strip);
*/
?>