Subversion Repositories eFlore/Applications.cel

Compare Revisions

Ignore whitespace Rev 2459 → Rev 2462

/trunk/jrest/bibliotheque/NameParser.php
1,19 → 1,27
<?php
 
// declare(encoding='UTF-8');
/**
* Classe permettant de convertir une chaine d'un nom scientifique en un format standard.
*
* Source orignale :
* Taxamatch-Webservice PHP v1.0.0
* @author Michael Giddens
* @link http://www.silverbiology.com
*
*
* @internal Mininum PHP version : 5.2
* @category CEL
* @package Services
* @subpackage Bibliothèques
* @version 0.1
* @author Mathias CHOUET <mathias@tela-botanica.org>
* @author David DELON <david@clapas.net>
* @author Jean-Pascal MILCENT <jpm@tela-botanica.org>
* @author Aurelien PERONNET <aurelien@tela-botanica.org>
* @license GPL v3 <http://www.gnu.org/licenses/gpl.txt>
* @license CECILL v2 <http://www.cecill.info/licences/Licence_CeCILL_V2-en.txt>
* @copyright 1999-2014 Tela Botanica <accueil@tela-botanica.org>
*/
 
/* Adapation par David Delon Decembre 2010 : gestion sous espece
*/
 
 
/**
* Class NameParser
* Used to convert a string to a standarized format.
*/
class NameParser {
 
/**
22,20 → 30,12
*/
public $debug_flag;
 
 
 
/**
* Constructor
*/
public function __construct( ) {
}
 
/**
* Sets value to the method property
* @param mixed $name class property name
* @param mixed $value class property value
*/
public function set($name,$value) {
public function set($name, $value) {
$this->$name = $value;
}
 
45,11 → 45,9
* @param string $str : string to reduce space
* @return string : string with only once space between characters
*/
private function reduce_spaces( $str ) {
$str = preg_replace("/ {2,}/", ' ', $str );
private function reduce_spaces($str) {
$str = preg_replace('/ {2,}/', ' ', $str );
$str = trim( $str );
return( $str );
}
 
72,11 → 70,10
* @param integer $upcase : convert to uppercase if $upcase = 1
* @return string : parsed author string
*/
public function parse_auth( $str, $upcase=1 ) {
 
public function parse_auth($str, $upcase = 1) {
$this->debug['parse_auth'][] = "1";
$temp = $str = trim($str);
 
if ( ($str == NULL) || ($str == '') ) {
$this->debug['parse_auth'][] = "1a";
return '';
88,11 → 85,11
} else {
 
$this->debug['parse_auth'][] = "2b";
 
// add space after full stops, except at end (NB, will also add spece before some close brackets)
$temp = rtrim( str_replace('.', '. ', $temp) );
$this->debug['parse_auth'][] = "4 (temp:$temp)";
 
//normalise "et", "and" to ampersand (et al. is a special case)
// if ( $temp like '% et al%' ) {
if ( preg_match('/ et al/', $temp) ) {
99,7 → 96,7
$temp = str_replace(' et al','zzzzz', $temp);
$this->debug['parse_auth'][] = "4a (temp:$temp)";
}
 
$temp = str_replace(' et ',' & ', $temp );
$temp = str_replace(' and ',' & ', $temp );
 
106,7 → 103,7
$temp = str_replace('zzzzz',' et al', $temp);
 
$this->debug['parse_auth'][] = "5 (temp:$temp)";
 
//remove commas before dates (only)
// like '%, 17%'
if ( preg_match('/, 17/', $temp) ) {
113,35 → 110,34
$temp = str_replace(', 17',' 17', $temp);
$this->debug['parse_auth'][] = "5a (temp:$temp)";
}
 
// like '%, 18%'
if ( preg_match('/, 18/', $temp) ) {
$temp = str_replace(', 18',' 18', $temp);
$this->debug['parse_auth'][] = "5b (temp:$temp)";
}
 
// like '%, 19%'
if ( preg_match('/, 19/', $temp) ) {
$temp = str_replace(', 19',' 19', $temp);
$this->debug['parse_auth'][] = "5c (temp:$temp)";
}
 
// like '%, 20%'
if ( preg_match('/, 20/', $temp) ) {
$temp = str_replace(', 20',' 20', $temp);
$this->debug['parse_auth'][] = "5d (temp:$temp)";
}
 
// reduce multiple internal spaces to single space
$temp = $this->reduce_spaces( $temp );
 
// like '% -%'
$temp = str_replace(' -', '-', $temp);
 
$this->debug['parse_auth'][] = "6 (temp:$temp)";
foreach( explode(' ', $temp) as $this_word ) {
 
foreach (explode(' ', $temp) as $this_word) {
//$this->debug['parse_auth'][] = "7 (this_word:$this_word)";
$elapsed_chars = '';
// like '(%'
155,14 → 151,11
$elapsed_chars .= $this_word . ' ';
//$this->debug['parse_auth'][] = "7c (this_word:$this_word) (elapsed_chars:$elapsed_chars)";
}
$elapsed_chars = $this->reduce_spaces( str_replace(' )', ')', $elapsed_chars) );
return trim( $elapsed_chars ) ;
}
}
 
}
/**
* Function: parse
* Purpose: Produces parsed version of an input string (scientific name components)
174,12 → 167,12
* Remarks:
* (1) Removes known text elements e.g.
* 'aff.', 'cf.', 'subsp.', subgenera if enclosed in brackets, etc. as desired
* (2) Removes accented and non A-Z characters other than full stops
* (2) Removes accented and non A-Z characters other than full stops
* (in scientific name portions)
* (3) Returns uppercase scientific name (genus + species only)
* (3) Returns uppercase scientific name (genus + species only)
* plus unaltered (presumed) authority
* examples;
* Anabaena cf. flos-aquae Ralfs ex Born. et Flah. => ANABAENA FLOSAQUAE Ralfs
* Anabaena cf. flos-aquae Ralfs ex Born. et Flah. => ANABAENA FLOSAQUAE Ralfs
* ex Born. et Flah.
* Abisara lemÈe-pauli => ABISARA LEMEEPAULI
* Fuc/us Vesiculos2us => FUCUS VESICULOSUS
186,7 → 179,7
* Buffo ignicolor LacÈpËde, 1788 => BUFFO IGNICOLOR LacÈpËde, 1788
* Barbatia (Mesocibota) bistrigata (Dunker, 1866) => BARBATIA BISTRIGATA (Dunker, 1866)
* (4) Thus version does not handle genus+author, or genus+species+infraspecies
* (second" good" term is presumed to be species epithet, anything after is
* (second" good" term is presumed to be species epithet, anything after is
* considered to be start of the authority), however could be adapted further as required
* and actually it was done in this version for Tela Botanica
* (5) There is a separate function "parse_auth" for normalizing authorities when required
196,9 → 189,7
* @return string : parsed string
*/
public function parse( $str = NULL ) {
unset($this->debug['parse']);
 
$temp = '';
$first_str_part = NULL;
$second_str_part = NULL;
207,7 → 198,7
$temp_genus_species = '';
$temp_authority = '';
$temp_infra = '';
 
//$this->debug['parse'][] = "1";
 
if ( ($str == NULL) || ( trim($str) == '') ) {
232,8 → 223,8
//$this->debug['parse'][] = "2b1 (temp:$temp)";
 
// remove any content in angle brackets (e.g. html tags - <i>, </i>, etc.)
$html_pattern = "(\<(/?[^\>]+)\>)";
//? This should not just handle html tags but all <*>
$html_pattern = '(\<(/?[^\>]+)\>)';
//? This should not just handle html tags but all <*>
$temp = preg_replace( $html_pattern, '', $temp);
//$this->debug['parse'][] = "2b2 (temp:$temp)";
 
243,25 → 234,25
// (obviously this will not suit genus + author alone, where first part of authorname is in brackets,
// however this is very rare?? and in any case we are not supporting genus+authority in this version)
//if ( $temp like '% (%)%'
$temp = preg_replace( "/ \(\w*\W*\)/", '', $temp, 1 );
//? Not sure if this will catch if
$temp = preg_replace( '/ \(\w*\W*\)/', '', $temp, 1 );
//? Not sure if this will catch if
//$this->debug['parse'][] = "2b3 (temp:$temp)";
 
// if second term (only) is in square brackets, presume it is a comment and remove it
// example: Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918]
// example: Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918]
//if ( $temp like '% [%]%'
$temp = preg_replace( "/ \[\w*\W*\]/", '', $temp, 1 );
//? Not sure if this will catch if
$temp = preg_replace( '/ \[\w*\W*\]/', '', $temp, 1 );
//? Not sure if this will catch if
//$this->debug['parse'][] = "2b4 (temp:$temp)";
 
// drop indicators of questionable id's - presume all are lowercase for now (could extend as needed)
$temp = preg_replace( "/ cf /", " ", $temp );
$temp = preg_replace( "/ cf\. /", " ", $temp );
$temp = preg_replace( "/ near /", " ", $temp );
$temp = preg_replace( "/ aff\. /", " ", $temp );
$temp = preg_replace( "/ sp\. /", " ", $temp );
$temp = preg_replace( "/ spp\. /", " ", $temp );
$temp = preg_replace( "/ spp /", " ", $temp );
$temp = preg_replace('/ cf /', ' ', $temp );
$temp = preg_replace('/ cf\. /', ' ', $temp );
$temp = preg_replace('/ near /', ' ', $temp );
$temp = preg_replace('/ aff\. /', ' ', $temp );
$temp = preg_replace('/ sp\. /', ' ', $temp );
$temp = preg_replace('/ spp\. /', ' ', $temp );
$temp = preg_replace('/ spp /', ' ', $temp );
 
//$this->debug['parse'][] = "2b5 (temp:$temp)";
 
273,7 → 264,7
// now presume first element is genus, second (if present) is species, remainder
// (if present) is authority
// look for genus name
$ar = explode( " ", $temp, 2);
$ar = explode(' ', $temp, 2);
if ( count( $ar ) ) {
$temp_genus = $ar[0];
$temp = @$ar[1];
281,11 → 272,11
$temp_genus = $temp;
$temp = '';
}
 
//$this->debug['parse'][] = "2b7 (temp_genus:$temp_genus) (temp:$temp)";
 
// look for species epithet and authority
$ar = explode( " ", $temp, 2);
$ar = explode(' ', $temp, 2);
if ( count( $ar ) ) {
$temp_species = $ar[0];
$temp_authority = @$ar[1];
293,34 → 284,34
$temp_species = $temp;
$temp_authority = '';
}
// look for subspecies
// look for subspecies
 
$infras =array('subsp.','var.');
$infras =array('subsp.','var.');
 
$temp_authority = preg_replace( "/ssp./", "subsp.", $temp_authority);
$temp_authority = preg_replace( "/ssp /", "subsp.", $temp_authority);
$temp_authority = preg_replace( "/subsp /", "subsp.", $temp_authority);
$temp_authority = preg_replace( "/var /", "var.", $temp_authority);
$temp_authority = preg_replace( "/ssp./", "subsp.", $temp_authority);
$temp_authority = preg_replace( "/ssp /", "subsp.", $temp_authority);
$temp_authority = preg_replace( "/subsp /", "subsp.", $temp_authority);
$temp_authority = preg_replace( "/var /", "var.", $temp_authority);
 
$temp_infra_authority = '';
$temp_infra_type = '';
foreach ($infras as $infra) {
$pos = strpos($temp_authority, $infra);
if ($pos === false) {
continue;
} else {
$temp_infra=substr($temp_authority,$pos+strlen($infra));
$temp_authority=substr($temp_authority,0,$pos);
$temp_infra=trim($temp_infra);
$temp_infra_type=$infra;
// look for infra epithet and authority
$ar = explode(" ", $temp_infra, 2);
if ( count( $ar ) ) {
$temp_infra = $ar[0];
$temp_infra_authority = @$ar[1];
}
break; // on s'arrete au premier trouve
}
$temp_infra_authority = '';
$temp_infra_type = '';
foreach ($infras as $infra) {
$pos = strpos($temp_authority, $infra);
if ($pos === false) {
continue;
} else {
$temp_infra=substr($temp_authority,$pos+strlen($infra));
$temp_authority=substr($temp_authority,0,$pos);
$temp_infra=trim($temp_infra);
$temp_infra_type=$infra;
// look for infra epithet and authority
$ar = explode(' ', $temp_infra, 2);
if ( count( $ar ) ) {
$temp_infra = $ar[0];
$temp_infra_authority = @$ar[1];
}
break; // on s'arrete au premier trouve
}
}
 
//$this->debug['parse'][] = "2b8 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)";
332,27 → 323,26
 
//$this->debug['parse'][] = "2b9 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)";
 
$temp_genus= trim($temp_genus);
$temp_genus= trim($temp_genus);
$temp_species= trim($temp_species);
$temp_infra= trim($temp_infra );
 
// reduce any new multiple internal spaces to single space, if present
$temp_genus= $this->reduce_spaces( $temp_genus );
$temp_genus= $this->reduce_spaces( $temp_genus );
$temp_species= $this->reduce_spaces( $temp_species );
$temp_infra= $this->reduce_spaces( $temp_infra );
 
//$this->debug['parse'][] = "2b10 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)";
 
if (isset($temp_authority) && ($temp_authority!='') ) {
$temp_authority=$this->parse_auth($temp_authority);
}
if (isset($temp_authority) && ($temp_authority!='') ) {
$temp_authority=$this->parse_auth($temp_authority);
}
 
if (isset($temp_infra_authority) && ($temp_infra_authority!='') ) {
$temp_infra_authority=$this->parse_auth($temp_infra_authority);
}
if (isset($temp_infra_authority) && ($temp_infra_authority!='') ) {
$temp_infra_authority=$this->parse_auth($temp_infra_authority);
}
//$this->debug['parse'][] = "2b11 (temp_genus:$temp_genus) (temp_species:$temp_species) (temp_authority:$temp_authority) (temp_infra:$temp_infra) (temp_infra_authority:$temp_infra_authority) (temp:$temp)";
return array("genus"=>$temp_genus, "species"=>$temp_species, "authority"=>$temp_authority, "infra"=>$temp_infra, "infra_authority"=>$temp_infra_authority, "infra_type"=>$temp_infra_type);
}
} // End NameParser
} // End Class
?>
return array("genus"=>$temp_genus, "species"=>$temp_species, "authority"=>$temp_authority, "infra"=>$temp_infra, "infra_authority"=>$temp_infra_authority, "infra_type"=>$temp_infra_type);
}
}
}