Rev 1318 | Rev 2462 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
<?php
/**
 * Taxamatch-Webservice PHP v1.0.0
 * @author Michael Giddens
 * @link http://www.silverbiology.com
 */

/* Adapted by David Delon, December 2010: handling of infraspecific ranks (subspecies, variety). */

/**
 * Class NameParser
 * Splits a scientific name string into genus, species, infraspecific epithet
 * and the associated authorities, and normalises authority strings.
 */
class NameParser {

	/**
	 * Whether to debug or not.
	 * Not read by this class itself; it is only stored for callers.
	 * @var bool|integer
	 */
	public $debug_flag;

	/**
	 * Trace entries recorded while parsing, keyed by method name.
	 * parse_auth() appends to the 'parse_auth' key; parse() clears the 'parse' key.
	 * @var array
	 */
	public $debug = array();

	/**
	 * Constructor
	 */
	public function __construct( ) {
	}

	/**
	 * Assigns a value to the named property of this instance.
	 * @param mixed $name class property name
	 * @param mixed $value class property value
	 */
	public function set( $name, $value ) {
		$this->$name = $value;
	}

	/**
	 * Reduce Spaces
	 * Collapses every run of two or more spaces into one space and trims the result.
	 * @param string $str : string to reduce space
	 * @return string : string with only one space between characters
	 */
	private function reduce_spaces( $str ) {
		return trim( preg_replace( '/ {2,}/', ' ', $str ) );
	}

	/**
	 * Splits a string at its first space.
	 * @param string $text : space separated text
	 * @return array : array( first word, remainder ), the remainder being '' when there is none
	 */
	private function split_first_word( $text ) {
		$parts = explode( ' ', $text, 2 );
		return array( $parts[0], isset( $parts[1] ) ? $parts[1] : '' );
	}

	/**
	 * Expands the UTF-8 encoded AE / OE ligatures into their two-letter ASCII form.
	 * @param string $text : name element
	 * @return string : name element without ligatures
	 */
	private function expand_ligatures( $text ) {
		// Byte escapes keep the literals independent of this file's own encoding.
		return str_replace(
			array( "\xC3\x86", "\xC3\xA6", "\xC5\x92", "\xC5\x93" ),
			array( 'AE', 'ae', 'OE', 'oe' ),
			$text
		);
	}

	/**
	 * Function: parse_auth
	 * Purpose: Produce a parsed (normalised) version of the authority of a taxon name
	 * @author Tony Rees (Tony.Rees@csiro.au)
	 * Date created: March 2008
	 * Remarks:
	 * (1) Recognises " and ", " et " as equivalents of "&" (special case for "et al",
	 *     which is left untouched) - both are rewritten to an ampersand
	 * (2) Recognises (e.g.) "Smith 1980" and "Smith, 1980" as equivalents - the comma is
	 *     removed before years starting with 17, 18, 19 or 20
	 * (3) Recognises (e.g.) "F. J. R. Taylor, 1980" and "F.J.R. Taylor, 1980" as
	 *     equivalents - a single space is enforced after every full stop
	 * (4) Letter case and diacritical marks are returned unchanged
	 * (5) No expansion of abbreviated author names is performed here
	 *
	 * @param string $str : authority string
	 * @param integer $upcase : accepted for interface compatibility; currently not applied
	 * @return string : parsed author string ('' for empty input)
	 */
	public function parse_auth( $str, $upcase=1 ) {
		$this->debug['parse_auth'][] = "1";
		$temp = trim( (string) $str );
		if ( $temp === '' ) {
			$this->debug['parse_auth'][] = "1a";
			return '';
		}
		$this->debug['parse_auth'][] = "2b";

		// add space after full stops, except at end (NB, will also add space before some close brackets)
		$temp = rtrim( str_replace( '.', '. ', $temp ) );
		$this->debug['parse_auth'][] = "4 (temp:$temp)";

		// normalise "et", "and" to ampersand; "et al" is shielded behind a placeholder meanwhile
		if ( strpos( $temp, ' et al' ) !== false ) {
			$temp = str_replace( ' et al', 'zzzzz', $temp );
			$this->debug['parse_auth'][] = "4a (temp:$temp)";
		}
		$temp = str_replace( array( ' et ', ' and ' ), ' & ', $temp );
		$temp = str_replace( 'zzzzz', ' et al', $temp );
		$this->debug['parse_auth'][] = "5 (temp:$temp)";

		// remove commas before dates (only)
		$centuries = array( '17' => '5a', '18' => '5b', '19' => '5c', '20' => '5d' );
		foreach ( $centuries as $century => $step ) {
			if ( strpos( $temp, ', ' . $century ) !== false ) {
				$temp = str_replace( ', ' . $century, ' ' . $century, $temp );
				$this->debug['parse_auth'][] = "$step (temp:$temp)";
			}
		}

		// reduce multiple internal spaces to single space, then close up " -"
		$temp = $this->reduce_spaces( $temp );
		$temp = str_replace( ' -', '-', $temp );
		$this->debug['parse_auth'][] = "6 (temp:$temp)";

		// Every word is kept. The previous word-by-word loop reset its accumulator on
		// each iteration and therefore returned only the last word of the authority.
		// Close up the space that step (3) introduces before a closing bracket.
		return $this->reduce_spaces( str_replace( ' )', ')', $temp ) );
	}

	/**
	 * Function: parse
	 * Purpose: Produces parsed version of an input string (scientific name components)
	 * @author Tony Rees (Tony.Rees@csiro.au)
	 * Date created: June 2007-November 2008
	 * Inputs: input string as str (genus, genus+species, genus+species+authority, or
	 * genus+species+authority+rank marker+infraspecific epithet+authority)
	 * Remarks:
	 * (1) Decodes "&amp;" and removes anything enclosed in angle brackets (e.g. html tags)
	 * (2) Removes a bracketed second term (subgenus or comment) in round or square brackets
	 * (3) Removes the lowercase qualifiers " cf ", " cf. ", " near ", " aff. ", " sp. ",
	 *     " spp. " and " spp "
	 * (4) The first word is taken as genus, the second as species epithet, the remainder
	 *     as authority; within the remainder the first rank marker found ("subsp." is
	 *     looked for before "var."; "ssp", "ssp.", "subsp" and "var" are accepted as
	 *     spellings) starts the infraspecific part
	 * (5) AE / OE ligatures in genus, species and infraspecific epithet are expanded;
	 *     letter case is not changed
	 * (6) Both authorities are normalised with parse_auth()
	 * examples:
	 * Anabaena cf. flos-aquae Ralfs ex Born. et Flah. => genus "Anabaena",
	 *     species "flos-aquae", authority "Ralfs ex Born. & Flah."
	 * Barbatia (Mesocibota) bistrigata (Dunker, 1866) => genus "Barbatia",
	 *     species "bistrigata", authority "(Dunker 1866)"
	 * Aphis [?] ficus Theobald, [1918] => genus "Aphis", species "ficus",
	 *     authority "Theobald, [1918]"
	 *
	 * @param string $str : input string
	 * @return array|string : '' when the input is null or blank, otherwise an array with the
	 *     string entries "genus", "species", "authority", "infra", "infra_authority"
	 *     and "infra_type" ('' for every element that is absent)
	 */
	public function parse( $str = NULL ) {
		unset( $this->debug['parse'] );

		if ( $str === null ) {
			return '';
		}
		// trim any leading, trailing spaces or line feeds
		$temp = trim( (string) $str );
		if ( $temp === '' ) {
			return '';
		}

		// replace any HTML ampersands
		$temp = str_ireplace( '&amp;', '&', $temp );

		// remove any content in angle brackets (e.g. html tags - <i>, </i>, etc.)
		$temp = preg_replace( '(\<(/?[^\>]+)\>)', '', $temp );

		// if second term (only) is in round brackets, presume it is a subgenus or a comment and remove it
		// examples: Barbatia (Mesocibota) bistrigata (Dunker, 1866) => Barbatia bistrigata (Dunker, 1866)
		//           Barbatia (?) bistrigata (Dunker, 1866) => Barbatia bistrigata (Dunker, 1866)
		// The pattern is anchored on the first word so that a bracketed authority further
		// along, as in "Abies alba (L.) Mill.", is never removed.
		$temp = preg_replace( '/^(\s*\S+) +\(\w*\W*\)/', '$1', $temp, 1 );

		// if second term (only) is in square brackets, presume it is a comment and remove it
		// example: Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918]
		$temp = preg_replace( '/^(\s*\S+) +\[\w*\W*\]/', '$1', $temp, 1 );

		// drop indicators of questionable id's - presume all are lowercase for now
		// (applied one after the other, because adjacent indicators share their separating space)
		$qualifiers = array( '/ cf /', '/ cf\. /', '/ near /', '/ aff\. /', '/ sp\. /', '/ spp\. /', '/ spp /' );
		foreach ( $qualifiers as $qualifier ) {
			$temp = preg_replace( $qualifier, ' ', $temp );
		}

		// eliminate or close up any stray spaces introduced by the above
		$temp = $this->reduce_spaces( $temp );

		// now presume first element is genus, second (if present) is species, remainder
		// (if present) is authority
		list( $temp_genus, $temp ) = $this->split_first_word( $temp );
		list( $temp_species, $temp_authority ) = $this->split_first_word( $temp );

		// look for an infraspecific rank marker
		// A marker only counts when it starts a token: it must not directly follow a letter,
		// a digit or a byte of a multi-byte character, so that e.g. "Bolivar 1900" is not
		// mistaken for a variety.
		$word_start = '(?<![A-Za-z0-9\x80-\xff])';
		$temp_authority = preg_replace( '/' . $word_start . 'ssp[. ]/', 'subsp.', $temp_authority );
		$temp_authority = preg_replace( '/' . $word_start . 'subsp /', 'subsp.', $temp_authority );
		$temp_authority = preg_replace( '/' . $word_start . 'var /', 'var.', $temp_authority );

		$temp_infra = '';
		$temp_infra_authority = '';
		$temp_infra_type = '';
		foreach ( array( 'subsp.', 'var.' ) as $infra ) {
			$marker = '/' . $word_start . preg_quote( $infra, '/' ) . '/';
			if ( !preg_match( $marker, $temp_authority, $hit, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}
			$pos = $hit[0][1];
			$remainder = trim( (string) substr( $temp_authority, $pos + strlen( $infra ) ) );
			$temp_authority = (string) substr( $temp_authority, 0, $pos );
			$temp_infra_type = $infra;
			// look for infra epithet and authority
			list( $temp_infra, $temp_infra_authority ) = $this->split_first_word( $remainder );
			break; // stop at the first rank found
		}

		// replace selected ligatures here (names can contain AE, OE ligatures),
		// then tidy up whatever whitespace is left
		$temp_genus = $this->reduce_spaces( trim( $this->expand_ligatures( $temp_genus ) ) );
		$temp_species = $this->reduce_spaces( trim( $this->expand_ligatures( $temp_species ) ) );
		$temp_infra = $this->reduce_spaces( trim( $this->expand_ligatures( $temp_infra ) ) );

		if ( $temp_authority != '' ) {
			$temp_authority = $this->parse_auth( $temp_authority );
		}
		if ( $temp_infra_authority != '' ) {
			$temp_infra_authority = $this->parse_auth( $temp_infra_authority );
		}

		return array(
			"genus" => $temp_genus,
			"species" => $temp_species,
			"authority" => $temp_authority,
			"infra" => $temp_infra,
			"infra_authority" => $temp_infra_authority,
			"infra_type" => $temp_infra_type
		);
	} // End parse
} // End Class NameParser