* @author David DELON
 * @author Jean-Pascal MILCENT
 * @author Aurelien PERONNET
 * @license GPL v3
 * @license CECILL v2
 * @copyright 1999-2014 Tela Botanica
 */
class NameParser {

	/**
	 * Whether to debug or not.
	 * No method of this class reads it.
	 * @var bool|integer
	 */
	public $debug_flag;

	/**
	 * Trace of the processing steps, indexed by method name.
	 * parse_auth() appends to $debug['parse_auth'] on every call and never clears it;
	 * parse() drops $debug['parse'] when it starts.
	 * Declared explicitly so that writing to it does not create a dynamic property.
	 * @var array
	 */
	public $debug = array();

	/**
	 * Sets a property of this object.
	 * @param mixed $name  class property name
	 * @param mixed $value class property value
	 */
	public function set($name, $value) {
		$this->$name = $value;
	}

	/**
	 * Reduce spaces.
	 * Collapses every run of two or more spaces into a single space and trims both ends.
	 * @param string $str : string to clean up
	 * @return string : string with at most one space between characters
	 */
	private function reduce_spaces($str) {
		return trim(preg_replace('/ {2,}/', ' ', $str));
	}

	/**
	 * Function: parse_auth
	 * Purpose: Produce a normalised version of the authority of a taxon name
	 * @author Tony Rees (Tony.Rees@csiro.au)
	 * Date created: March 2008
	 * Remarks:
	 * (1) " et " and " and " are both turned into " & " ("et al" is left untouched)
	 * (2) "Smith, 1980" and "Smith 1980" are equivalent: a comma followed by a space and
	 *     17, 18, 19 or 20 is removed
	 * (3) "F. J. R. Taylor" and "F.J.R. Taylor" are equivalent: every full stop is followed
	 *     by exactly one space, except at the end of the string and before ")"
	 * (4) A space in front of a hyphen is removed
	 * (5) Case and diacritical marks are left intact
	 * Examples:
	 *   "Smith, 1980"             => "Smith 1980"
	 *   "Ralfs ex Born. et Flah." => "Ralfs ex Born. & Flah."
	 *   "(L.) Smith et al."       => "(L.) Smith et al."
	 *
	 * @param string $str : authority string
	 * @param integer $upcase : accepted for backward compatibility; it is not applied,
	 *                          the authority is returned in its original case
	 * @return string : normalised authority string ('' for an empty input)
	 */
	public function parse_auth($str, $upcase = 1) {
		$this->debug['parse_auth'][] = "1";

		// the cast keeps trim() away from null
		$temp = trim((string) $str);
		if ($temp === '') {
			$this->debug['parse_auth'][] = "1a";
			return '';
		}
		$this->debug['parse_auth'][] = "2b";

		// add a space after full stops, except at the end
		// (this also puts a space before some closing brackets; it is removed at the end)
		$temp = rtrim(str_replace('.', '. ', $temp));
		$this->debug['parse_auth'][] = "4 (temp:$temp)";

		// normalise "et" and "and" to ampersand; "et al" is shielded by a placeholder meanwhile
		if (strpos($temp, ' et al') !== false) {
			$temp = str_replace(' et al', 'zzzzz', $temp);
			$this->debug['parse_auth'][] = "4a (temp:$temp)";
		}
		$temp = str_replace(array(' et ', ' and '), ' & ', $temp);
		$temp = str_replace('zzzzz', ' et al', $temp);
		$this->debug['parse_auth'][] = "5 (temp:$temp)";

		// remove commas before dates (only): ", 17", ", 18", ", 19", ", 20"
		$centuries = array('a' => '17', 'b' => '18', 'c' => '19', 'd' => '20');
		foreach ($centuries as $step => $century) {
			if (strpos($temp, ', ' . $century) !== false) {
				$temp = str_replace(', ' . $century, ' ' . $century, $temp);
				$this->debug['parse_auth'][] = "5{$step} (temp:$temp)";
			}
		}

		// reduce multiple internal spaces to a single space, then close up " -"
		$temp = $this->reduce_spaces($temp);
		$temp = str_replace(' -', '-', $temp);
		$this->debug['parse_auth'][] = "6 (temp:$temp)";

		// The whole normalised string is returned, not only its last word.
		// Close up the space that the full stop rule inserted before a closing bracket.
		return $this->reduce_spaces(str_replace(' )', ')', $temp));
	}

	/**
	 * Function: parse
	 * Purpose: Splits a scientific name into its components
	 * @author Tony Rees (Tony.Rees@csiro.au)
	 * Date created: June 2007-November 2008, adapted for Tela Botanica
	 * Inputs: genus, genus+species, genus+species+authority, optionally followed by
	 *         an infraspecific rank ("subsp.", "ssp.", "var."), epithet and authority
	 * Remarks:
	 * (1) "&amp;" is decoded and anything between angle brackets (e.g. HTML tags) is removed
	 * (2) If the second term is a simple group in round or square brackets
	 *     (subgenus or comment), it is removed
	 * (3) The lowercase markers " cf ", " cf. ", " near ", " aff. ", " sp. ", " spp. "
	 *     and " spp " are removed
	 * (4) The first term is taken as the genus, the second as the species epithet and the
	 *     remainder as the authority; genus + authority alone is not supported
	 * (5) The first rank marker found in the remainder ("subsp." is looked for before
	 *     "var.") ends the species authority; the next term is the infraspecific epithet
	 *     and what follows is its authority
	 * (6) The ligature "Æ" is replaced by "AE" in genus and epithets; case is not changed
	 * (7) Both authorities are normalised with parse_auth()
	 * Examples:
	 *   "Barbatia (Mesocibota) bistrigata (Dunker, 1866)"
	 *      => genus "Barbatia", species "bistrigata", authority "(Dunker 1866)"
	 *   "Anabaena cf. flos-aquae Ralfs ex Born. et Flah."
	 *      => genus "Anabaena", species "flos-aquae", authority "Ralfs ex Born. & Flah."
	 *   "Acer campestre L. subsp. leiocarpum (Opiz) Pax"
	 *      => genus "Acer", species "campestre", authority "L.", infra "leiocarpum",
	 *         infra_authority "(Opiz) Pax", infra_type "subsp."
	 *
	 * @param string $str : input string
	 * @return array|string : '' for an empty input, otherwise an array with the keys
	 *                        "genus", "species", "authority", "infra", "infra_authority"
	 *                        and "infra_type" ('' for every missing component)
	 */
	public function parse( $str = NULL ) {
		unset($this->debug['parse']);

		// trim any leading, trailing spaces or line feeds
		$temp = trim((string) $str);
		if ($temp === '') {
			return '';
		}

		// replace any HTML ampersands
		$temp = str_ireplace('&amp;', '&', $temp);

		// remove any content in angle brackets (e.g. html tags)
		$temp = preg_replace('/<[^>]+>/', '', $temp);

		// if the second term (only) is in round brackets, presume it is a subgenus or a comment
		// and remove it; anchoring on the first term keeps bracketed authors further on
		// example: Barbatia (Mesocibota) bistrigata (Dunker, 1866) => Barbatia bistrigata (Dunker, 1866)
		$temp = preg_replace('/^(\s*\S+) \(\w*\W*\)/', '$1', $temp, 1);

		// if the second term (only) is in square brackets, presume it is a comment and remove it
		// example: Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918]
		$temp = preg_replace('/^(\s*\S+) \[\w*\W*\]/', '$1', $temp, 1);

		// drop indicators of questionable id's - all presumed lowercase
		// (str_replace applies the searches one after the other, each on the previous result)
		$doubt_markers = array(' cf ', ' cf. ', ' near ', ' aff. ', ' sp. ', ' spp. ', ' spp ');
		$temp = str_replace($doubt_markers, ' ', $temp);

		// eliminate or close up any stray spaces introduced by the above
		$temp = $this->reduce_spaces($temp);

		// first term is the genus
		$parts = explode(' ', $temp, 2);
		$temp_genus = $parts[0];
		$temp = isset($parts[1]) ? $parts[1] : '';

		// second term (if present) is the species epithet, the remainder is the authority
		$parts = explode(' ', $temp, 2);
		$temp_species = $parts[0];
		$temp_authority = isset($parts[1]) ? $parts[1] : '';

		// spell the rank markers one way only: "ssp.", "ssp", "subsp" => "subsp.", "var" => "var."
		// (a dotless marker must start a term, so that e.g. "Bolivar 1900" is left alone)
		$temp_authority = preg_replace('/(?<!\S)(?:ssp\.|ssp |subsp )/', 'subsp.', $temp_authority);
		$temp_authority = preg_replace('/(?<!\S)var /', 'var.', $temp_authority);

		// look for an infraspecific rank; stop at the first rank found
		$temp_infra = '';
		$temp_infra_authority = '';
		$temp_infra_type = '';
		foreach (array('subsp.', 'var.') as $infra) {
			$pos = strpos($temp_authority, $infra);
			if ($pos === false) {
				continue;
			}
			$infra_rest = trim(substr($temp_authority, $pos + strlen($infra)));
			$temp_authority = substr($temp_authority, 0, $pos);
			$temp_infra_type = $infra;

			// infraspecific epithet, then its authority
			$parts = explode(' ', $infra_rest, 2);
			$temp_infra = $parts[0];
			$temp_infra_authority = isset($parts[1]) ? $parts[1] : '';
			break;
		}

		// replace the Æ ligature, then trim and reduce any multiple internal spaces
		$temp_genus = $this->reduce_spaces(str_replace('Æ', 'AE', $temp_genus));
		$temp_species = $this->reduce_spaces(str_replace('Æ', 'AE', $temp_species));
		$temp_infra = $this->reduce_spaces(str_replace('Æ', 'AE', $temp_infra));

		// normalise the authorities
		if ($temp_authority != '') {
			$temp_authority = $this->parse_auth($temp_authority);
		}
		if ($temp_infra_authority != '') {
			$temp_infra_authority = $this->parse_auth($temp_infra_authority);
		}

		return array(
			"genus" => $temp_genus,
			"species" => $temp_species,
			"authority" => $temp_authority,
			"infra" => $temp_infra,
			"infra_authority" => $temp_infra_authority,
			"infra_type" => $temp_infra_type
		);
	}
}