1,19 → 1,27 |
<?php |
|
// declare(encoding='UTF-8'); |
/** |
* Classe permettant de convertir une chaine d'un nom scientifique en un format standard. |
* |
* Source originale : |
* Taxamatch-Webservice PHP v1.0.0 |
* @author Michael Giddens |
* @link http://www.silverbiology.com |
* |
* |
* @internal Minimum PHP version : 5.2 |
* @category CEL |
* @package Services |
* @subpackage Bibliothèques |
* @version 0.1 |
* @author Mathias CHOUET <mathias@tela-botanica.org> |
* @author David DELON <david@clapas.net> |
* @author Jean-Pascal MILCENT <jpm@tela-botanica.org> |
* @author Aurelien PERONNET <aurelien@tela-botanica.org> |
* @license GPL v3 <http://www.gnu.org/licenses/gpl.txt> |
* @license CECILL v2 <http://www.cecill.info/licences/Licence_CeCILL_V2-en.txt> |
* @copyright 1999-2014 Tela Botanica <accueil@tela-botanica.org> |
*/ |
|
/* Adaptation par David Delon Decembre 2010 : gestion sous espece |
*/ |
|
|
/** |
* Class NameParser |
* Used to convert a string to a standardized format. |
*/ |
class NameParser { |
|
/** |
22,20 → 30,12 |
*/ |
public $debug_flag; |
|
|
|
/** |
* Constructor. |
* |
* Takes no arguments; the body is empty, so no state is initialised here. |
*/ |
public function __construct( ) { |
} |
|
/**
 * Assigns a value to the instance property whose name is given.
 *
 * The property name is resolved dynamically at runtime; no check is made
 * that a property of that name is declared on the class.
 *
 * @param mixed $name  class property name
 * @param mixed $value class property value
 */
public function set($name, $value) {
	// Variable-property assignment: $name holds the property's name.
	$this->$name = $value;
}
|
45,11 → 45,9 |
* @param string $str : string to reduce space |
* @return string : string with only one space between characters |
*/ |
private function reduce_spaces($str) {
	// Collapse every run of two or more space characters into a single space.
	// Only the literal space is matched; tabs and newlines are left untouched.
	$collapsed = preg_replace('/ {2,}/', ' ', $str);

	// Strip whitespace from both ends of the collapsed string.
	return trim($collapsed);
}
|
72,11 → 70,10 |
* @param integer $upcase : convert to uppercase if $upcase = 1 |
* @return string : parsed author string |
*/ |
public function parse_auth( $str, $upcase=1 ) { |
|
public function parse_auth($str, $upcase = 1) { |
$this->debug['parse_auth'][] = "1"; |
$temp = $str = trim($str); |
|
|
if ( ($str == NULL) || ($str == '') ) { |
$this->debug['parse_auth'][] = "1a"; |
return ''; |
88,11 → 85,11 |
} else { |
|
$this->debug['parse_auth'][] = "2b"; |
|
|
// add space after full stops, except at end (NB, will also add space before some close brackets) |
$temp = rtrim( str_replace('.', '. ', $temp) ); |
$this->debug['parse_auth'][] = "4 (temp:$temp)"; |
|
|
//normalise "et", "and" to ampersand (et al. is a special case) |
// if ( $temp like '% et al%' ) { |
if ( preg_match('/ et al/', $temp) ) { |
99,7 → 96,7 |
$temp = str_replace(' et al','zzzzz', $temp); |
$this->debug['parse_auth'][] = "4a (temp:$temp)"; |
} |
|
|
$temp = str_replace(' et ',' & ', $temp ); |
$temp = str_replace(' and ',' & ', $temp ); |
|
106,7 → 103,7 |
$temp = str_replace('zzzzz',' et al', $temp); |
|
$this->debug['parse_auth'][] = "5 (temp:$temp)"; |
|
|
//remove commas before dates (only) |
// like '%, 17%' |
if ( preg_match('/, 17/', $temp) ) { |
113,35 → 110,34 |
$temp = str_replace(', 17',' 17', $temp); |
$this->debug['parse_auth'][] = "5a (temp:$temp)"; |
} |
|
|
// like '%, 18%' |
if ( preg_match('/, 18/', $temp) ) { |
$temp = str_replace(', 18',' 18', $temp); |
$this->debug['parse_auth'][] = "5b (temp:$temp)"; |
} |
|
|
// like '%, 19%' |
if ( preg_match('/, 19/', $temp) ) { |
$temp = str_replace(', 19',' 19', $temp); |
$this->debug['parse_auth'][] = "5c (temp:$temp)"; |
} |
|
|
// like '%, 20%' |
if ( preg_match('/, 20/', $temp) ) { |
$temp = str_replace(', 20',' 20', $temp); |
$this->debug['parse_auth'][] = "5d (temp:$temp)"; |
} |
|
|
// reduce multiple internal spaces to single space |
$temp = $this->reduce_spaces( $temp ); |
|
|
// like '% -%' |
$temp = str_replace(' -', '-', $temp); |
|
$this->debug['parse_auth'][] = "6 (temp:$temp)"; |
|
foreach( explode(' ', $temp) as $this_word ) { |
|
|
foreach (explode(' ', $temp) as $this_word) { |
//$this->debug['parse_auth'][] = "7 (this_word:$this_word)"; |
$elapsed_chars = ''; |
// like '(%' |
155,14 → 151,11 |
$elapsed_chars .= $this_word . ' '; |
//$this->debug['parse_auth'][] = "7c (this_word:$this_word) (elapsed_chars:$elapsed_chars)"; |
} |
|
$elapsed_chars = $this->reduce_spaces( str_replace(' )', ')', $elapsed_chars) ); |
|
return trim( $elapsed_chars ) ; |
} |
} |
|
} |
|
/** |
* Function: parse |
* Purpose: Produces parsed version of an input string (scientific name components) |
174,12 → 167,12 |
* Remarks: |
* (1) Removes known text elements e.g. |
* 'aff.', 'cf.', 'subsp.', subgenera if enclosed in brackets, etc. as desired |
* (2) Removes accented and non A-Z characters other than full stops |
* (2) Removes accented and non A-Z characters other than full stops |
* (in scientific name portions) |
* (3) Returns uppercase scientific name (genus + species only) |
* (3) Returns uppercase scientific name (genus + species only) |
* plus unaltered (presumed) authority |
* examples; |
* Anabaena cf. flos-aquae Ralfs ex Born. et Flah. => ANABAENA FLOSAQUAE Ralfs |
* Anabaena cf. flos-aquae Ralfs ex Born. et Flah. => ANABAENA FLOSAQUAE Ralfs |
* ex Born. et Flah. |
* Abisara lemée-pauli => ABISARA LEMEEPAULI |
* Fuc/us Vesiculos2us => FUCUS VESICULOSUS |
186,7 → 179,7 |
* Buffo ignicolor Lacépède, 1788 => BUFFO IGNICOLOR Lacépède, 1788 |
* Barbatia (Mesocibota) bistrigata (Dunker, 1866) => BARBATIA BISTRIGATA (Dunker, 1866) |
* (4) This version does not handle genus+author, or genus+species+infraspecies |
* (second "good" term is presumed to be species epithet, anything after is |
* (second "good" term is presumed to be species epithet, anything after is |
* considered to be start of the authority), however could be adapted further as required |
* and actually it was done in this version for Tela Botanica |
* (5) There is a separate function "parse_auth" for normalizing authorities when required |
196,9 → 189,7 |
* @return string : parsed string |
*/ |
public function parse( $str = NULL ) { |
|
unset($this->debug['parse']); |
|
$temp = ''; |
$first_str_part = NULL; |
$second_str_part = NULL; |
207,7 → 198,7 |
$temp_genus_species = ''; |
$temp_authority = ''; |
$temp_infra = ''; |
|
|
//$this->debug['parse'][] = "1"; |
|
if ( ($str == NULL) || ( trim($str) == '') ) { |
232,8 → 223,8 |
//$this->debug['parse'][] = "2b1 (temp:$temp)"; |
|
// remove any content in angle brackets (e.g. html tags - <i>, </i>, etc.) |
$html_pattern = "(\<(/?[^\>]+)\>)"; |
//? This should not just handle html tags but all <*> |
$html_pattern = '(\<(/?[^\>]+)\>)'; |
//? This should not just handle html tags but all <*> |
$temp = preg_replace( $html_pattern, '', $temp); |
//$this->debug['parse'][] = "2b2 (temp:$temp)"; |
|
243,25 → 234,25 |
// (obviously this will not suit genus + author alone, where first part of authorname is in brackets, |
// however this is very rare?? and in any case we are not supporting genus+authority in this version) |
//if ( $temp like '% (%)%' |
$temp = preg_replace( "/ \(\w*\W*\)/", '', $temp, 1 ); |
//? Not sure if this will catch if |
$temp = preg_replace( '/ \(\w*\W*\)/', '', $temp, 1 ); |
//? Not sure if this will catch if |
//$this->debug['parse'][] = "2b3 (temp:$temp)"; |
|
// if second term (only) is in square brackets, presume it is a comment and remove it |
// example: Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918] |
// example: Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918] |
//if ( $temp like '% [%]%' |
$temp = preg_replace( "/ \[\w*\W*\]/", '', $temp, 1 ); |
//? Not sure if this will catch if |
$temp = preg_replace( '/ \[\w*\W*\]/', '', $temp, 1 ); |
//? Not sure if this will catch if |
//$this->debug['parse'][] = "2b4 (temp:$temp)"; |
|
// drop indicators of questionable id's - presume all are lowercase for now (could extend as needed) |
$temp = preg_replace( "/ cf /", " ", $temp ); |
$temp = preg_replace( "/ cf\. /", " ", $temp ); |
$temp = preg_replace( "/ near /", " ", $temp ); |
$temp = preg_replace( "/ aff\. /", " ", $temp ); |
$temp = preg_replace( "/ sp\. /", " ", $temp ); |
$temp = preg_replace( "/ spp\. /", " ", $temp ); |
$temp = preg_replace( "/ spp /", " ", $temp ); |
$temp = preg_replace('/ cf /', ' ', $temp ); |
$temp = preg_replace('/ cf\. /', ' ', $temp ); |
$temp = preg_replace('/ near /', ' ', $temp ); |
$temp = preg_replace('/ aff\. /', ' ', $temp ); |
$temp = preg_replace('/ sp\. /', ' ', $temp ); |
$temp = preg_replace('/ spp\. /', ' ', $temp ); |
$temp = preg_replace('/ spp /', ' ', $temp ); |
|
//$this->debug['parse'][] = "2b5 (temp:$temp)"; |
|
273,7 → 264,7 |
// now presume first element is genus, second (if present) is species, remainder |
// (if present) is authority |
// look for genus name |
$ar = explode( " ", $temp, 2); |
$ar = explode(' ', $temp, 2); |
if ( count( $ar ) ) { |
$temp_genus = $ar[0]; |
$temp = @$ar[1]; |
281,11 → 272,11 |
$temp_genus = $temp; |
$temp = ''; |
} |
|
|
//$this->debug['parse'][] = "2b7 (temp_genus:$temp_genus) (temp:$temp)"; |
|
// look for species epithet and authority |
$ar = explode( " ", $temp, 2); |
$ar = explode(' ', $temp, 2); |
if ( count( $ar ) ) { |
$temp_species = $ar[0]; |
$temp_authority = @$ar[1]; |
293,34 → 284,34 |
$temp_species = $temp; |
$temp_authority = ''; |
} |
// look for subspecies |
// look for subspecies |
|
$infras =array('subsp.','var.'); |
$infras =array('subsp.','var.'); |
|
$temp_authority = preg_replace( "/ssp./", "subsp.", $temp_authority); |
$temp_authority = preg_replace( "/ssp /", "subsp.", $temp_authority); |
$temp_authority = preg_replace( "/subsp /", "subsp.", $temp_authority); |
$temp_authority = preg_replace( "/var /", "var.", $temp_authority); |
$temp_authority = preg_replace( "/ssp./", "subsp.", $temp_authority); |
$temp_authority = preg_replace( "/ssp /", "subsp.", $temp_authority); |
$temp_authority = preg_replace( "/subsp /", "subsp.", $temp_authority); |
$temp_authority = preg_replace( "/var /", "var.", $temp_authority); |
|
$temp_infra_authority = ''; |
$temp_infra_type = ''; |
foreach ($infras as $infra) { |
$pos = strpos($temp_authority, $infra); |
if ($pos === false) { |
continue; |
} else { |
$temp_infra=substr($temp_authority,$pos+strlen($infra)); |
$temp_authority=substr($temp_authority,0,$pos); |
$temp_infra=trim($temp_infra); |
$temp_infra_type=$infra; |
// look for infra epithet and authority |
$ar = explode(" ", $temp_infra, 2); |
if ( count( $ar ) ) { |
$temp_infra = $ar[0]; |
$temp_infra_authority = @$ar[1]; |
} |
break; // on s'arrete au premier trouve |
} |
$temp_infra_authority = ''; |
$temp_infra_type = ''; |
foreach ($infras as $infra) { |
$pos = strpos($temp_authority, $infra); |
if ($pos === false) { |
continue; |
} else { |
$temp_infra=substr($temp_authority,$pos+strlen($infra)); |
$temp_authority=substr($temp_authority,0,$pos); |
$temp_infra=trim($temp_infra); |
$temp_infra_type=$infra; |
// look for infra epithet and authority |
$ar = explode(' ', $temp_infra, 2); |
if ( count( $ar ) ) { |
$temp_infra = $ar[0]; |
$temp_infra_authority = @$ar[1]; |
} |
break; // on s'arrete au premier trouve |
} |
} |
|
//$this->debug['parse'][] = "2b8 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)"; |
332,27 → 323,26 |
|
//$this->debug['parse'][] = "2b9 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)"; |
|
$temp_genus= trim($temp_genus); |
$temp_genus= trim($temp_genus); |
$temp_species= trim($temp_species); |
$temp_infra= trim($temp_infra ); |
|
// reduce any new multiple internal spaces to single space, if present |
$temp_genus= $this->reduce_spaces( $temp_genus ); |
$temp_genus= $this->reduce_spaces( $temp_genus ); |
$temp_species= $this->reduce_spaces( $temp_species ); |
$temp_infra= $this->reduce_spaces( $temp_infra ); |
|
//$this->debug['parse'][] = "2b10 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)"; |
|
if (isset($temp_authority) && ($temp_authority!='') ) { |
$temp_authority=$this->parse_auth($temp_authority); |
} |
if (isset($temp_authority) && ($temp_authority!='') ) { |
$temp_authority=$this->parse_auth($temp_authority); |
} |
|
if (isset($temp_infra_authority) && ($temp_infra_authority!='') ) { |
$temp_infra_authority=$this->parse_auth($temp_infra_authority); |
} |
if (isset($temp_infra_authority) && ($temp_infra_authority!='') ) { |
$temp_infra_authority=$this->parse_auth($temp_infra_authority); |
} |
//$this->debug['parse'][] = "2b11 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)"; |
return array("genus"=>$temp_genus, "species"=>$temp_species, "authority"=>$temp_authority, "infra"=>$temp_infra, "infra_authority"=>$temp_infra_authority, "infra_type"=>$temp_infra_type); |
} |
} // End NameParser |
} // End Class |
?> |
return array("genus"=>$temp_genus, "species"=>$temp_species, "authority"=>$temp_authority, "infra"=>$temp_infra, "infra_authority"=>$temp_infra_authority, "infra_type"=>$temp_infra_type); |
} |
} |
} |