WebSVN – eFlore/Applications.cel – /trunk/jrest/services/NameParser.php

<?php

/**
 * Taxamatch-Webservice PHP v1.0.0
 * @author Michael Giddens
 * @link http://www.silverbiology.com
 */

 /* Adaptation by David Delon, December 2010: handling of subspecies
    (infraspecific ranks).
 */


        /**
         * Class NameParser
         * Splits a scientific name string into its components (genus, species
         * epithet, authority and, when present, infraspecific rank marker,
         * infraspecific epithet and infraspecific authority) and normalises
         * authority strings so that they can be compared.
         */
        class NameParser {

            /**
             * Whether to debug or not.
             * Not read anywhere inside this class; it can be assigned through set().
             * @var bool|integer
             */
            public $debug_flag;

            /**
             * Trace of the steps taken by parse() and parse_auth(), keyed by
             * method name ('parse', 'parse_auth'). Declared explicitly so that
             * writing to it does not create a dynamic property.
             * @var array
             */
            public $debug = array();

            /**
             * Constructor (nothing to initialise).
             */
            public function __construct( ) {
            }

            /**
             * Sets a property of this object by name.
             * @param mixed $name class property name
             * @param mixed $value class property value
             */
            public function set( $name, $value ) {
                $this->$name = $value;
            }

            /**
             * Reduce Spaces
             * Collapses every run of two or more spaces into a single space and
             * trims the result.
             * @param string $str : string to reduce space
             * @return string : string with at most one space between characters
             */
            private function reduce_spaces( $str ) {
                // The cast keeps a NULL argument from reaching preg_replace().
                return trim( preg_replace( '/ {2,}/', ' ', (string) $str ) );
            }

            /**
             * Appends one snapshot of the parse() working variables to the
             * 'parse' debug trace.
             * @param string $step : trace label (e.g. "2b8")
             * @param string $genus
             * @param string $species
             * @param string $authority
             * @param string $infra
             * @param string|null $infra_authority
             * @param string $temp : remainder of the name after the genus
             */
            private function trace_parse( $step, $genus, $species, $authority, $infra, $infra_authority, $temp ) {
                $this->debug['parse'][] = "$step (temp_genus:$genus) (temp_species:$species) (temp_authority:$authority) (temp_infra:$infra) (temp_infra_authority:$infra_authority) (temp:$temp)";
            }

            /**
             * Function: parse_auth
             * Purpose: Produce a normalised version of the authority of a taxon name
             * @author Tony Rees (Tony.Rees@csiro.au)
             * Date created: March 2008
             * Remarks:
             *  (1) Recognises "and", "et", "&" as equivalents (special case for "et al.") -
             *      " et " and " and " are rewritten to " & "
             *  (2) Recognises (e.g.) "Smith 1980" and "Smith, 1980" as equivalents - the comma
             *      is removed before years starting with 17, 18, 19 or 20
             *  (3) Recognises (e.g.) "F. J. R. Taylor, 1980" and "F.J.R. Taylor, 1980" as
             *      equivalents - a single space is put after every full stop
             *  (4) This implementation performs no expansion of abbreviated author names and
             *      does not change the case of the string; diacritical marks are left intact
             *
             * Each call appends its steps to $this->debug['parse_auth'].
             *
             * @param string $str : authority string
             * @param integer $upcase : accepted for backward compatibility; it is not used by
             *                          this implementation
             * @return string : normalised authority string ('' for empty input)
             */
            public function parse_auth( $str, $upcase=1 ) {

                $this->debug['parse_auth'][] = "1";
                $temp = trim( (string) $str );

                if ( $temp === '' ) {
                    $this->debug['parse_auth'][] = "1a";
                    return '';
                }

                $this->debug['parse_auth'][] = "2b";

                // add space after full stops, except at end (NB, will also add space before
                // some close brackets; those are closed up again at the end)
                $temp = rtrim( str_replace( '.', '. ', $temp ) );
                $this->debug['parse_auth'][] = "4 (temp:$temp)";

                // normalise "et", "and" to ampersand. " et al" must survive untouched, so the
                // string is cut at every " et al" and only the pieces in between are rewritten.
                $pieces = explode( ' et al', $temp );
                if ( count( $pieces ) > 1 ) {
                    $this->debug['parse_auth'][] = "4a (temp:$temp)";
                }
                foreach ( $pieces as $i => $piece ) {
                    $pieces[$i] = str_replace( array( ' et ', ' and ' ), ' & ', $piece );
                }
                $temp = implode( ' et al', $pieces );
                $this->debug['parse_auth'][] = "5 (temp:$temp)";

                // remove commas before dates (only): ", 17xx" .. ", 20xx"
                $century_steps = array( '17' => '5a', '18' => '5b', '19' => '5c', '20' => '5d' );
                foreach ( $century_steps as $century => $step ) {
                    if ( strpos( $temp, ', ' . $century ) !== false ) {
                        $temp = str_replace( ', ' . $century, ' ' . $century, $temp );
                        $this->debug['parse_auth'][] = "$step (temp:$temp)";
                    }
                }

                // reduce multiple internal spaces to single space
                $temp = $this->reduce_spaces( $temp );

                // close up a space in front of a hyphen
                $temp = str_replace( ' -', '-', $temp );
                $this->debug['parse_auth'][] = "6 (temp:$temp)";

                // Rebuild the string word by word. A leading "(" is detached from its word
                // and re-attached unchanged, so this loop only re-joins the words.
                $elapsed_chars = '';
                foreach ( explode( ' ', $temp ) as $this_word ) {

                    $this->debug['parse_auth'][] = "7 (this_word:$this_word)";

                    if ( strpos( $this_word, '(' ) === 0 ) {
                        $elapsed_chars .= '(';
                        $this_word = substr( $this_word, 1 );
                        $this->debug['parse_auth'][] = "7a (this_word:$this_word) (elapsed_chars:$elapsed_chars)";
                    }

                    // Add back the word to the final translation
                    $elapsed_chars .= $this_word . ' ';
                    $this->debug['parse_auth'][] = "7c (this_word:$this_word) (elapsed_chars:$elapsed_chars)";
                }

                // undo the space that the full-stop step put in front of close brackets
                $elapsed_chars = $this->reduce_spaces( str_replace( ' )', ')', $elapsed_chars ) );

                return trim( $elapsed_chars );
            }

            /**
             * Function: parse
             * Purpose: Splits an input string (scientific name) into its components
             * @author Tony Rees (Tony.Rees@csiro.au)
             * Date created: June 2007-November 2008
             * Inputs: genus, genus+species, genus+species+authority, optionally followed by an
             * infraspecific rank marker ("subsp.", "ssp.", "ssp", "subsp", "var.", "var"),
             * an infraspecific epithet and its authority (Tela Botanica adaptation)
             * Remarks:
             *    (1) Decodes "&amp;" and removes anything in angle brackets (HTML tags)
             *    (2) Removes a second term in round or square brackets (subgenus or comment);
             *        brackets further along the name (e.g. in the authority) are kept
             *    (3) Removes the qualifiers ' cf ', ' cf. ', ' near ', ' aff. ', ' sp. ',
             *        ' spp. ', ' spp '
             *    (4) Replaces the ligatures AE / OE (both cases) by their two-letter form in
             *        genus, species and infraspecific epithet
             *    (5) The case of genus and epithets is not changed; authorities are passed
             *        through parse_auth()
             *     examples;
             *       Anabaena cf. flos-aquae Ralfs ex Born. et Flah.
             *         => genus "Anabaena", species "flos-aquae", authority "Ralfs ex Born. & Flah."
             *       Barbatia (Mesocibota) bistrigata (Dunker, 1866)
             *         => genus "Barbatia", species "bistrigata", authority "(Dunker 1866)"
             *       Festuca ovina L. ssp alpina (Suter) Hegi
             *         => genus "Festuca", species "ovina", authority "L.", infra_type "subsp.",
             *            infra "alpina", infra_authority "(Suter) Hegi"
             *    (6) Genus+author (without species) is not supported: the second term is always
             *        taken to be the species epithet
             *
             * The steps of the last call are available in $this->debug['parse'].
             *
             * @param string $str : input string
             * @return array|string : '' when the input is empty, otherwise an array with the keys
             *         "genus", "species", "authority", "infra" (strings) and
             *         "infra_authority", "infra_type" (string, or NULL when absent)
             */
            public function parse( $str = NULL ) {

                unset( $this->debug['parse'] );
                $this->debug['parse'][] = "1";

                // Loose comparison on purpose: NULL, '', 0 and false all count as "no input".
                if ( $str == NULL || trim( $str ) === '' ) {
                    $this->debug['parse'][] = "1a";
                    return '';
                }

                // trim any leading, trailing spaces or line feeds
                $temp = trim( $str );
                $this->debug['parse'][] = "1b";
                $this->debug['parse'][] = "2b";

                // replace any HTML ampersands (&amp; in any letter case)
                $temp = str_ireplace( '&amp;', '&', $temp );
                $this->debug['parse'][] = "2b1 (temp:$temp)";

                // remove any content in angle brackets (e.g. html tags - <i>, </i>, etc.)
                $temp = preg_replace( '/<[^>]+>/', '', $temp );
                $this->debug['parse'][] = "2b2 (temp:$temp)";

                // if second term (only) is in round brackets, presume it is a subgenus or a
                // comment and remove it. The pattern is anchored on the first term so that a
                // bracketed authority further along is never removed.
                // examples: Barbatia (Mesocibota) bistrigata (Dunker, 1866) => Barbatia bistrigata (Dunker, 1866)
                // Barbatia (?) bistrigata (Dunker, 1866) => Barbatia bistrigata (Dunker, 1866)
                $temp = preg_replace( '/^(\S+) +\(\w*\W*\)/', '$1', $temp, 1 );
                $this->debug['parse'][] = "2b3 (temp:$temp)";

                // if second term (only) is in square brackets, presume it is a comment and remove it
                // example: Aphis [?] ficus Theobald, [1918] => Aphis ficus Theobald, [1918]
                $temp = preg_replace( '/^(\S+) +\[\w*\W*\]/', '$1', $temp, 1 );
                $this->debug['parse'][] = "2b4 (temp:$temp)";

                // drop indicators of questionable id's - presume all are lowercase for now
                $qualifiers = array( ' cf ', ' cf. ', ' near ', ' aff. ', ' sp. ', ' spp. ', ' spp ' );
                $temp = str_replace( $qualifiers, ' ', $temp );
                $this->debug['parse'][] = "2b5 (temp:$temp)";

                // eliminate or close up any stray spaces introduced by the above
                $temp = $this->reduce_spaces( $temp );
                $this->debug['parse'][] = "2b6 (temp:$temp)";

                // now presume first element is genus, second (if present) is species,
                // remainder (if present) is authority
                $ar = explode( ' ', $temp, 2 );
                $temp_genus = $ar[0];
                $temp = isset( $ar[1] ) ? $ar[1] : '';
                $this->debug['parse'][] = "2b7 (temp_genus:$temp_genus) (temp:$temp)";

                $ar = explode( ' ', $temp, 2 );
                $temp_species = $ar[0];
                $temp_authority = isset( $ar[1] ) ? $ar[1] : '';

                // look for an infraspecific rank (subspecies, variety).
                // A rank marker only counts when it starts a word (start of string, after
                // white space or after an opening bracket), so that author names such as
                // "Bolivar" are not mistaken for "var".
                $word_start = '(?<![^\s(])';
                $temp_authority = preg_replace( '/' . $word_start . 'ssp(?:\.|(?=\s))/', 'subsp.', $temp_authority );
                $temp_authority = preg_replace( '/' . $word_start . '(subsp|var)(?=\s)/', '$1.', $temp_authority );

                $temp_infra = '';
                $temp_infra_type = NULL;
                $temp_infra_authority = NULL;

                foreach ( array( 'subsp.', 'var.' ) as $infra ) {
                    $pattern = '/' . $word_start . preg_quote( $infra, '/' ) . '/';
                    if ( !preg_match( $pattern, $temp_authority, $match, PREG_OFFSET_CAPTURE ) ) {
                        continue;
                    }
                    $pos = $match[0][1];

                    // what precedes the marker is the species authority, what follows is the
                    // infraspecific epithet and (if present) its authority
                    $after_marker = trim( substr( $temp_authority, $pos + strlen( $infra ) ) );
                    $temp_authority = substr( $temp_authority, 0, $pos );
                    $temp_infra_type = $infra;

                    $ar = explode( ' ', $after_marker, 2 );
                    $temp_infra = $ar[0];
                    $temp_infra_authority = isset( $ar[1] ) ? $ar[1] : NULL;

                    break; // stop at the first rank found
                }

                $this->trace_parse( '2b8', $temp_genus, $temp_species, $temp_authority, $temp_infra, $temp_infra_authority, $temp );

                // replace selected ligatures. Written as UTF-8 byte sequences so that the
                // result does not depend on the encoding this file is saved in.
                $ligatures = array(
                    "\xC3\x86" => 'AE', // U+00C6 LATIN CAPITAL LETTER AE
                    "\xC3\xA6" => 'ae', // U+00E6 LATIN SMALL LETTER AE
                    "\xC5\x92" => 'OE', // U+0152 LATIN CAPITAL LIGATURE OE
                    "\xC5\x93" => 'oe', // U+0153 LATIN SMALL LIGATURE OE
                );
                $temp_genus = strtr( $temp_genus, $ligatures );
                $temp_species = strtr( $temp_species, $ligatures );
                $temp_infra = strtr( $temp_infra, $ligatures );

                $this->trace_parse( '2b9', $temp_genus, $temp_species, $temp_authority, $temp_infra, $temp_infra_authority, $temp );

                // trim and reduce any multiple internal spaces to single space, if present
                $temp_genus = $this->reduce_spaces( $temp_genus );
                $temp_species = $this->reduce_spaces( $temp_species );
                $temp_infra = $this->reduce_spaces( $temp_infra );

                $this->trace_parse( '2b10', $temp_genus, $temp_species, $temp_authority, $temp_infra, $temp_infra_authority, $temp );

                // normalise the authorities that are present
                if ( $temp_authority != '' ) {
                    $temp_authority = $this->parse_auth( $temp_authority );
                }

                if ( isset( $temp_infra_authority ) && ( $temp_infra_authority != '' ) ) {
                    $temp_infra_authority = $this->parse_auth( $temp_infra_authority );
                }

                $this->trace_parse( '2b11', $temp_genus, $temp_species, $temp_authority, $temp_infra, $temp_infra_authority, $temp );

                return array(
                    "genus" => $temp_genus,
                    "species" => $temp_species,
                    "authority" => $temp_authority,
                    "infra" => $temp_infra,
                    "infra_authority" => $temp_infra_authority,
                    "infra_type" => $temp_infra_type
                );

            } // End parse

        } // End Class

?>
Subversion Repositories eFlore/Applications.cel

(root)/trunk/jrest/services/NameParser.php @ 664 – Rev