Subversion Repositories eFlore/Applications.cel

Rev

Rev 2459 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
862 aurelien 1
<?php
2462 jpm 2
// declare(encoding='UTF-8');
862 aurelien 3
/**
2462 jpm 4
 * Classe permettant de convertir une chaine d'un nom scientifique en un format standard.
5
 *
6
 * Source originale :
862 aurelien 7
 * Taxamatch-Webservice PHP v1.0.0
8
 * @author Michael Giddens
9
 * @link http://www.silverbiology.com
2462 jpm 10
 *
11
 *
12
 * @internal   Minimum PHP version : 5.2
13
 * @category   CEL
14
 * @package    Services
15
 * @subpackage Bibliothèques
16
 * @version    0.1
17
 * @author     Mathias CHOUET <mathias@tela-botanica.org>
18
 * @author     David DELON <david@clapas.net>
19
 * @author     Jean-Pascal MILCENT <jpm@tela-botanica.org>
20
 * @author     Aurelien PERONNET <aurelien@tela-botanica.org>
21
 * @license    GPL v3 <http://www.gnu.org/licenses/gpl.txt>
22
 * @license    CECILL v2 <http://www.cecill.info/licences/Licence_CeCILL_V2-en.txt>
23
 * @copyright  1999-2014 Tela Botanica <accueil@tela-botanica.org>
862 aurelien 24
 */
1318 aurelien 25
class NameParser {

	/**
	 * Debug switch.
	 * Kept for callers that assign it (directly or through set()); no method of
	 * this class reads it, trace entries are recorded regardless of its value.
	 * @var bool|integer
	 */
	public $debug_flag;

	/**
	 * Trace of the processing steps, keyed by method name ('parse_auth', 'parse').
	 * Declared explicitly so that writing to it does not create a dynamic
	 * property (deprecated since PHP 8.2).
	 * @var array
	 */
	public $debug = array();

	/**
	 * Assigns a value to the property of this object whose name is given.
	 * @param mixed $name class property name
	 * @param mixed $value class property value
	 */
	public function set($name, $value) {
		$this->$name = $value;
	}

	/**
	 * Collapses every run of two or more spaces into a single space and trims the result.
	 * Only the space character is collapsed (tabs and line feeds are left alone inside the
	 * string; trim() still strips them at both ends).
	 * @param string $str : string to clean
	 * @return string : string with at most one space between characters
	 */
	private function reduce_spaces($str) {
		return trim(preg_replace('/ {2,}/', ' ', $str));
	}

	/**
	 * Splits a string on its first space.
	 * @param string $text : string to split
	 * @return array : array(first word, remainder); the remainder is '' when there is no space
	 */
	private function split_first_word($text) {
		$parts = explode(' ', (string) $text, 2);
		return array($parts[0], isset($parts[1]) ? $parts[1] : '');
	}

	/**
	 * Produces a normalised version of the authority of a taxon name.
	 * Adapted from the Taxamatch routine written by Tony Rees (Tony.Rees@csiro.au), March 2008.
	 *
	 * What this version does:
	 *  (1) Inserts a space after every full stop, so that "F.J.R. Taylor" and "F. J. R. Taylor"
	 *      come out identical (extra spaces are collapsed afterwards)
	 *  (2) Turns " et " and " and " into " & " ("et al" is protected and left as is)
	 *  (3) Removes the comma placed before a year starting with 17, 18, 19 or 20
	 *      ("Smith, 1980" => "Smith 1980")
	 *  (4) Removes the space before a hyphen and before a closing round bracket
	 *
	 * What it does NOT do, although the original documentation said so: it does not expand
	 * abbreviated author names and it does not change the case of the string.
	 *
	 * Every call appends its steps to $this->debug['parse_auth'].
	 *
	 * @param string $str : authority string
	 * @param integer $upcase : accepted for compatibility with existing callers, currently ignored
	 * @return string : normalised authority, '' when the input is empty
	 */
	public function parse_auth($str, $upcase = 1) {
		$this->debug['parse_auth'][] = "1";
		// The cast avoids the "passing null to trim()" deprecation of PHP 8.1.
		$temp = trim((string) $str);

		if ($temp === '') {
			$this->debug['parse_auth'][] = "1a";
			return '';
		}

		$this->debug['parse_auth'][] = "2b";

		// Add a space after full stops, except at the end
		// (this also adds a space before some closing brackets, removed at the end).
		$temp = rtrim(str_replace('.', '. ', $temp));
		$this->debug['parse_auth'][] = "4 (temp:$temp)";

		// " et al" starts with " et ": hide it behind a placeholder while "et"/"and" are normalised.
		if (strpos($temp, ' et al') !== false) {
			$temp = str_replace(' et al', 'zzzzz', $temp);
			$this->debug['parse_auth'][] = "4a (temp:$temp)";
		}
		$temp = str_replace(array(' et ', ' and '), ' & ', $temp);
		$temp = str_replace('zzzzz', ' et al', $temp);
		$this->debug['parse_auth'][] = "5 (temp:$temp)";

		// Remove commas before dates (only): ", 17xx" ... ", 20xx".
		$centuries = array('17' => '5a', '18' => '5b', '19' => '5c', '20' => '5d');
		foreach ($centuries as $century => $step) {
			if (strpos($temp, ', ' . $century) !== false) {
				$temp = str_replace(', ' . $century, ' ' . $century, $temp);
				$this->debug['parse_auth'][] = "$step (temp:$temp)";
			}
		}

		// Reduce multiple internal spaces to a single space, then close up " -".
		$temp = $this->reduce_spaces($temp);
		$temp = str_replace(' -', '-', $temp);
		$this->debug['parse_auth'][] = "6 (temp:$temp)";

		// The previous implementation rebuilt the string word by word but emptied its
		// accumulator on every iteration, so only the last word was returned. Since each
		// word is put back unchanged, the rebuilt string is simply $temp: only the removal
		// of the space before ")" and the final clean-up remain.
		return $this->reduce_spaces(str_replace(' )', ')', $temp));
	}

	/**
	 * Splits a scientific name into its components.
	 * Adapted from the Taxamatch routine written by Tony Rees (Tony.Rees@csiro.au),
	 * June 2007-November 2008, and extended to recognise infraspecific ranks.
	 *
	 * Steps:
	 *    (1) "&amp;" / "&AMP;" are converted to "&" and anything between angle brackets is removed
	 *    (2) If the second term (only) is enclosed in round or square brackets it is presumed
	 *        to be a subgenus or a comment and is removed:
	 *          Barbatia (Mesocibota) bistrigata (Dunker, 1866) => Barbatia bistrigata (Dunker, 1866)
	 *          Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918]
	 *    (3) The lowercase markers "cf", "cf.", "near", "aff.", "sp.", "spp." and "spp" are
	 *        dropped when they stand between two spaces
	 *    (4) The first word is the genus, the second the species epithet, the rest the authority
	 *    (5) "ssp", "ssp.", "subsp" and "var" are normalised to "subsp." / "var."; when such a
	 *        rank is found in the authority, the word following it is the infraspecific epithet
	 *        and what follows is the infraspecific authority ("subsp." has priority over "var.")
	 *    (6) The ligatures Æ, æ, Œ, œ are expanded in genus, species and infraspecific epithet
	 *    (7) Both authorities are normalised with parse_auth()
	 *
	 * The case of the name is not modified. Genus + authority (without species) is not supported:
	 * the first word of the authority would be taken for the species epithet.
	 *
	 * @param string $str : input string ( genus, genus+species, or genus+species+authority )
	 * @return string|array : '' when the input is empty, otherwise an array with the keys
	 *                        'genus', 'species', 'authority', 'infra', 'infra_authority' and
	 *                        'infra_type' (all strings, '' when the component is absent)
	 */
	public function parse( $str = NULL ) {
		unset($this->debug['parse']);

		if ( ($str == NULL) || ( trim($str) == '') ) {
			return '';
		}

		// Trim any leading, trailing spaces or line feeds.
		$temp = trim($str);

		// Replace HTML ampersands. (The former search list contained '%' and never matched
		// "&amp;": it left entities untouched and turned percent signs into ampersands.)
		$temp = str_replace(array('&amp;', '&AMP;'), '&', $temp);

		// Remove any content in angle brackets (e.g. html tags - <i>, </i>, etc.).
		$temp = preg_replace('/<[^>]+>/', '', $temp);

		// Remove a bracketed SECOND term. The patterns are anchored on the first word so that a
		// bracketed authority or year further in the name is never removed by mistake.
		$temp = preg_replace('/^(\S+)\s+\(\w*\W*\)/', '$1', $temp, 1);
		$temp = preg_replace('/^(\S+)\s+\[\w*\W*\]/', '$1', $temp, 1);

		// Drop indicators of questionable id's - presumed lowercase and surrounded by spaces.
		$markers = array(' cf ', ' cf. ', ' near ', ' aff. ', ' sp. ', ' spp. ', ' spp ');
		$temp = str_replace($markers, ' ', $temp);

		// Eliminate or close up any stray spaces introduced by the above.
		$temp = $this->reduce_spaces($temp);

		// First element is the genus, second (if present) the species, remainder the authority.
		list($temp_genus, $temp) = $this->split_first_word($temp);
		list($temp_species, $temp_authority) = $this->split_first_word($temp);

		// Normalise the spelling of the ranks. Word boundaries keep author names such as
		// "Bolivar" from being read as a rank.
		$temp_authority = preg_replace('/\bssp(?=[. ])\.?/', 'subsp.', $temp_authority);
		$temp_authority = preg_replace('/\b(subsp|var)(?= )/', '$1.', $temp_authority);

		// Look for an infraspecific rank; the first rank of the list found in the string wins.
		$temp_infra = '';
		$temp_infra_authority = '';
		$temp_infra_type = '';
		foreach (array('subsp.', 'var.') as $infra) {
			$pattern = '/\b' . preg_quote($infra, '/') . '/';
			if (!preg_match($pattern, $temp_authority, $found, PREG_OFFSET_CAPTURE)) {
				continue;
			}
			$pos = $found[0][1];
			$after_rank = trim((string) substr($temp_authority, $pos + strlen($infra)));
			$temp_authority = (string) substr($temp_authority, 0, $pos);
			$temp_infra_type = $infra;
			// Infraspecific epithet, then its authority.
			list($temp_infra, $temp_infra_authority) = $this->split_first_word($after_rank);
			break;
		}

		// Expand ligatures (the name is not upper-cased, so both cases are handled) and trim.
		// The three values are single words: trimming is all the clean-up they need.
		$ligatures = array('Æ', 'æ', 'Œ', 'œ');
		$expansions = array('AE', 'ae', 'OE', 'oe');
		$temp_genus = trim(str_replace($ligatures, $expansions, $temp_genus));
		$temp_species = trim(str_replace($ligatures, $expansions, $temp_species));
		$temp_infra = trim(str_replace($ligatures, $expansions, $temp_infra));

		if ($temp_authority !== '') {
			$temp_authority = $this->parse_auth($temp_authority);
		}
		if ($temp_infra_authority !== '') {
			$temp_infra_authority = $this->parse_auth($temp_infra_authority);
		}

		return array(
			"genus" => $temp_genus,
			"species" => $temp_species,
			"authority" => $temp_authority,
			"infra" => $temp_infra,
			"infra_authority" => $temp_infra_authority,
			"infra_type" => $temp_infra_type
		);
	}
}