WebSVN – eFlore/Applications.cel – Blame – //branches/v2.7-grelinette/jrest/lib/NameParser.php

Rev	Author	Line No.	Line
862	aurelien	1	`<?php`
		2
		3	`/**`
		4	`* Taxamatch-Webservice PHP v1.0.0`
		5	`* @author Michael Giddens`
		6	`* @link http://www.silverbiology.com`
		7	`*/`
		8
		9	`/* Adaptation by David Delon, December 2010: subspecies handling`
		10	`*/`
		11
		12
/**
 * Class NameParser
 *
 * Splits a scientific name string into genus, species epithet, authority and
 * (optionally) one infraspecific rank with its own epithet and authority, and
 * normalises authority strings so that they can be compared.
 */
class NameParser {

    /**
     * Whether to record debug traces in $debug.
     * @var bool|integer
     */
    public $debug_flag;

    /**
     * Debug traces, indexed by method name ('parse_auth', 'parse').
     * Only filled while $debug_flag is truthy, so that long-running batch
     * jobs do not accumulate traces nobody reads.
     * @var array
     */
    public $debug = array();

    /**
     * Constructor
     */
    public function __construct( ) {
    }

    /**
     * Sets a property of this object by name (e.g. set('debug_flag', 1)).
     *
     * @param mixed $name  class property name
     * @param mixed $value class property value
     */
    public function set($name,$value) {
        $this->$name = $value;
    }

    /**
     * Appends a message to the debug trace of $scope when debugging is on.
     *
     * @param string $scope   name of the traced method
     * @param string $message trace message
     */
    private function trace( $scope, $message ) {
        if ( $this->debug_flag ) {
            $this->debug[$scope][] = $message;
        }
    }

    /**
     * Collapses every run of two or more space characters into a single
     * space and trims the result. Only the space character is collapsed;
     * tabs and line feeds are left untouched (apart from the final trim).
     *
     * @param string $str : string to reduce spaces in
     * @return string : string with single spaces only, trimmed
     */
    private function reduce_spaces( $str ) {
        return trim( preg_replace('/ {2,}/', ' ', $str) );
    }

    /**
     * Splits a string on its first space.
     *
     * @param string $text : text to split
     * @return array : array(first word, remainder), remainder is '' when absent
     */
    private function split_first_word( $text ) {
        $parts = explode(' ', $text, 2);
        return array( $parts[0], isset($parts[1]) ? $parts[1] : '' );
    }

    /**
     * Looks for an infraspecific rank marker in the text following the
     * species epithet and splits that text around it.
     *
     * Recognised markers: "subsp.", "subsp", "ssp.", "ssp" (all reported as
     * "subsp.") and "var.", "var" (reported as "var."). A marker without a
     * full stop must be followed by whitespace. A marker only counts when it
     * starts a token (start of text, or preceded by whitespace or
     * punctuation), so author names that merely end in "var" or contain
     * "ssp" are not mistaken for a rank. "subsp." is searched for first; the
     * search stops at the first rank found.
     *
     * @param string $authority : text following the species epithet
     * @return array : array(authority before the marker, rank ('' if none),
     *                 infra epithet, infra authority)
     */
    private function split_infra( $authority ) {
        $ranks = array( 'subsp.' => 'subsp|ssp', 'var.' => 'var' );

        foreach ( $ranks as $rank => $markers ) {
            // The lookbehind rejects a marker glued to the end of a word.
            $pattern = '/(?<![^\s.,;()\[\]])(?:' . $markers . ')(?:\.|\s)(.*)$/s';
            if ( !preg_match($pattern, $authority, $found, PREG_OFFSET_CAPTURE) ) {
                continue;
            }

            $before = substr( $authority, 0, $found[0][1] );
            list( $epithet, $epithet_authority ) = $this->split_first_word( trim($found[1][0]) );

            return array( $before, $rank, $epithet, $epithet_authority );
        }

        return array( $authority, '', '', '' );
    }

    /**
     * Function: parse_auth
     * Purpose: Produce a normalised version of the authority of a taxon name
     * @author Tony Rees (Tony.Rees@csiro.au)
     * Date created: March 2008
     * Remarks:
     * (1) " et " and " and " are turned into " & " ("et al" is left as is)
     * (2) A comma directly before a year starting with 17, 18, 19 or 20 is
     *     removed ("Smith, 1980" => "Smith 1980")
     * (3) A space is inserted after every full stop except the last one, so
     *     "F.J.R. Taylor" and "F. J. R. Taylor" give the same result
     * (4) A space before "-" or ")" is removed, multiple spaces are collapsed
     * (5) Letter case and diacritical marks are left intact
     *
     * @param string $str : authority string
     * @param integer $upcase : currently ignored; kept so existing callers
     *                          passing it keep working
     * @return string : normalised authority string ('' for empty input)
     */
    public function parse_auth( $str, $upcase=1 ) {

        $this->trace('parse_auth', "1");
        $temp = trim( (string) $str );

        if ( $temp === '' ) {
            $this->trace('parse_auth', "1a");
            return '';
        }

        $this->trace('parse_auth', "2b");

        // add space after full stops, except at end (NB, this also adds a
        // space before some close brackets; it is removed again at the end)
        $temp = rtrim( str_replace('.', '. ', $temp) );
        $this->trace('parse_auth', "4 (temp:$temp)");

        // normalise "et", "and" to ampersand; "et al" is shielded behind a
        // placeholder while the replacement runs
        if ( strpos($temp, ' et al') !== false ) {
            $temp = str_replace(' et al', 'zzzzz', $temp);
            $this->trace('parse_auth', "4a (temp:$temp)");
        }
        $temp = str_replace(' et ', ' & ', $temp);
        $temp = str_replace(' and ', ' & ', $temp);
        $temp = str_replace('zzzzz', ' et al', $temp);
        $this->trace('parse_auth', "5 (temp:$temp)");

        // remove commas before dates (only)
        $centuries = array( '17' => '5a', '18' => '5b', '19' => '5c', '20' => '5d' );
        foreach ( $centuries as $century => $step ) {
            if ( strpos($temp, ', ' . $century) !== false ) {
                $temp = str_replace(', ' . $century, ' ' . $century, $temp);
                $this->trace('parse_auth', "$step (temp:$temp)");
            }
        }

        // reduce multiple internal spaces to single space
        $temp = $this->reduce_spaces( $temp );
        $temp = str_replace(' -', '-', $temp);
        $this->trace('parse_auth', "6 (temp:$temp)");

        // The former word-by-word loop reset its accumulator on every word
        // and therefore returned only the last word of the authority. The
        // whole normalised string is returned instead; only the stray space
        // before a closing bracket still has to be removed.
        return $this->reduce_spaces( str_replace(' )', ')', $temp) );
    }

    /**
     * Function: parse
     * Purpose: Split a scientific name into its components
     * @author Tony Rees (Tony.Rees@csiro.au)
     * Date created: June 2007-November 2008
     * Remarks:
     * (1) HTML-encoded ampersands are decoded and anything in angle brackets
     *     (e.g. <i>, </i>) is removed
     * (2) A second term in round or square brackets (subgenus or comment) is
     *     removed:
     *     Barbatia (Mesocibota) bistrigata (Dunker, 1866)
     *       => genus "Barbatia", species "bistrigata", authority "(Dunker 1866)"
     *     Aphis [?] ficus Theobald, [1918]
     *       => genus "Aphis", species "ficus", authority "Theobald, [1918]"
     * (3) The lowercase markers "cf", "cf.", "near", "aff.", "sp.", "spp.",
     *     "spp" are dropped when surrounded by spaces
     * (4) First word is taken as genus, second as species epithet, the rest
     *     as authority; genus + author alone is not supported
     * (5) One infraspecific rank (subsp./ssp. or var.) is extracted from the
     *     text after the species epithet, with its epithet and authority
     * (6) AE/OE ligatures in genus, species and infra epithet are expanded
     * (7) Both authorities are normalised with parse_auth(); letter case of
     *     the name parts is not changed
     *
     * @param string $str : input string ( genus, genus+species, genus+species+authority,
     *                      optionally followed by rank + infra epithet + authority )
     * @return array|string : '' when the input is null or blank, otherwise an
     *         array with the string entries "genus", "species", "authority",
     *         "infra", "infra_authority" and "infra_type" ('' when absent)
     */
    public function parse( $str = NULL ) {

        unset($this->debug['parse']);

        if ( $str === null ) {
            return '';
        }

        // trim any leading, trailing spaces or line feeds
        $temp = trim( (string) $str );
        if ( $temp === '' ) {
            return '';
        }

        // replace any HTML ampersands
        $temp = str_replace( array('&amp;', '&AMP;'), '&', $temp );

        // remove any content in angle brackets (html tags and any other <...>)
        $temp = trim( preg_replace('/<[^>]+>/', '', $temp) );

        // if second term (only) is in round brackets, presume it is a subgenus
        // or a comment and remove it (this does not suit genus + bracketed
        // author, which is not supported anyway)
        $temp = preg_replace( '/^(\S+)\s+\([^()]*\)/', '$1', $temp, 1 );

        // if second term (only) is in square brackets, presume it is a comment
        $temp = preg_replace( '/^(\S+)\s+\[[^\[\]]*\]/', '$1', $temp, 1 );

        // drop indicators of questionable id's - lowercase only; the patterns
        // are applied one after the other, in this order
        $questionable = array(
            '/ cf /', '/ cf\. /', '/ near /', '/ aff\. /',
            '/ sp\. /', '/ spp\. /', '/ spp /',
        );
        $temp = preg_replace( $questionable, ' ', $temp );

        // eliminate or close up any stray spaces introduced by the above
        $temp = $this->reduce_spaces( $temp );

        // first element is genus, second (if present) is species, remainder
        // (if present) is authority, possibly holding an infraspecific part
        list( $temp_genus, $temp ) = $this->split_first_word( $temp );
        list( $temp_species, $temp_authority ) = $this->split_first_word( $temp );
        list( $temp_authority, $temp_infra_type, $temp_infra, $temp_infra_authority )
            = $this->split_infra( $temp_authority );

        // expand ligatures; written as UTF-8 byte sequences so that the
        // encoding of this source file cannot corrupt them
        // NOTE(review): assumes the input is UTF-8 encoded - confirm with callers
        $ligatures = array(
            "\xC3\x86" => 'AE', "\xC3\xA6" => 'ae',
            "\xC5\x92" => 'OE', "\xC5\x93" => 'oe',
        );
        $temp_genus = $this->reduce_spaces( strtr($temp_genus, $ligatures) );
        $temp_species = $this->reduce_spaces( strtr($temp_species, $ligatures) );
        $temp_infra = $this->reduce_spaces( strtr($temp_infra, $ligatures) );

        if ( $temp_authority !== '' ) {
            $temp_authority = $this->parse_auth( $temp_authority );
        }

        if ( $temp_infra_authority !== '' ) {
            $temp_infra_authority = $this->parse_auth( $temp_infra_authority );
        }

        return array(
            "genus" => $temp_genus,
            "species" => $temp_species,
            "authority" => $temp_authority,
            "infra" => $temp_infra,
            "infra_authority" => $temp_infra_authority,
            "infra_type" => $temp_infra_type,
        );
    } // End parse
} // End Class
		358	`?>`

Subversion Repositories eFlore/Applications.cel

(root)//branches/v2.7-grelinette/jrest/lib/NameParser.php – Rev 2432