Subversion Repositories eFlore/Projets.eflore-projets

Rev

Rev 880 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

<?php
/*
 * @author              Raphaël Droz <raphael@tela-botanica.org>
 * @copyright   Copyright (c) 2011, 2013 Tela Botanica (accueil@tela-botanica.org)
 * @license             http://www.cecill.info/licences/Licence_CeCILL_V2-fr.txt Licence CECILL
 * @license             http://www.gnu.org/licenses/gpl.html Licence GNU-GPL
 *
 * Corrige les erreurs de saisie de nom à l'aide d'une recherche via un index sphinx
 * pour les observations ayant un nom saisi et dont l'un au moins de nom_ret[nn],
 * nt ou famille est NULL ou vide.
 *
 */

// time php -d memory_limit=1024M sphinx-maj-nom-ret.php 0 > sphinx-maj.log
// 23 secondes

// settings
define('USE_NVJFL', FALSE);
define('ESCAPE_ON_SPHINX_SYNERROR', TRUE);

define('TRY_FORCE_START_LINE', TRUE);
define('TRY_SPLIT',                        TRUE);
define('TRY_EXACT',                        TRUE);
define('TRY_REF',                          TRUE);
define('TRY_SPLIT_AND_AUTEUR', FALSE);
define('TRY_REMOVE_L',             TRUE);

define('M_TRY_SPLIT',   0x01);
define('M_TRY_EXACT',   0x02);
define('M_TRY_REF',             0x04);
define('M_TRY_SPLIT_AND_AUTEUR',        0x08);

error_reporting(E_ALL);
$db = mysql_connect('localhost', 'root', '') or die('no mysql');
mysql_select_db('tb_cel', $db);
mysql_query("SET NAMES utf8", $db) or die('no sphinx');
$dbs = mysql_connect('127.0.0.1:9306', NULL, NULL, TRUE);

$req = <<<EOF
        SELECT id_observation, nom_sel, nom_sel_nn, nom_ret, nom_ret_nn, nt, famille, nom_referentiel
        FROM `cel_obs`
        WHERE nom_sel IS NOT NULL AND nom_sel != '' AND
                id_observation BETWEEN %d AND %d AND
                ( nom_ret IS NULL or nom_ret = ''
                  OR nt IS NULL or nt = 0 or nt = ''
                  OR famille IS NULL or famille = '' )
        LIMIT %d, %d
EOF;
// non: car nom_ret_nn peut-être légitimement à 0 [taxon identifié, sans nom_retenu]
// OR nom_ret_nn IS NULL or nom_ret_nn = 0 or nom_ret_nn = ''


array_shift($argv);
$start = array_shift($argv);
$max = array_shift($argv);
$chunk_size = array_shift($argv);

if(!$start) $start = 0;
// 1036314
if(!$max) $max = intval(mysql_fetch_assoc(mysql_query("SELECT MAX(id_observation) AS max FROM cel_obs", $db))['max']) + 1;
if(!$chunk_size) $chunk_size = 50000;


// escape sphinx
$from = array ( '\\', '(',')','|','-','!','@','~','"','&', '/', '^', '$', '=', "'", "\x00", "\n", "\r", "\x1a" );
$to   = array ( '\\\\', '\\\(','\\\)','\\\|','\\\-','\\\!','\\\@','\\\~','\\\"', '\\\&', '\\\/', '\\\^', '\\\$', '\\\=', "\\'", "\\x00", "\\n", "\\r", "\\x1a" );


$stats = ['no_nom_sel' => ['count' => 0, 'data' => [] ],
                  'not found' => ['count' => 0, 'data' => [] ],
                  'too many' => ['count' => 0, 'data' => [] ],
                  'fixable' => ['count' => 0, 'data' => [] ],
                  'sauvages' => ['count' => 0, 'data' => [] ],
                  'sphinx errors' => ['count' => 0, 'data' => [] ],
                  'ref pb' => ['count' => 0, 'data' => [] ], ];

$sphinx_req = sprintf("SELECT * FROM i_bdtfx %s WHERE MATCH('%%s') LIMIT 5", USE_NVJFL ? ", i_nvjfl" : "");

for($current = 0; $current < intval($max/$chunk_size) + 1; $current++) {
        // printf("current = %d, chunk_size = %d, max = %d (rmax = %d) [real limit: %d]\n", $current, $chunk_size, $max, intval($max/$chunk_size) + 1, $current*$chunk_size);
        // printf(strtr($req, "\n", " ") . "\n", $start, $max, $current*$chunk_size, $chunk_size);
        $data = mysql_query(sprintf($req, $start, $max, $current*$chunk_size, $chunk_size), $db);
        if(!$data) { var_dump(mysql_error()); die('end'); }
        while($d = mysql_fetch_assoc($data)) {
                $n  = trim($d['nom_sel']);
                //d: fprintf(STDERR, "$n\n");

                if(!$n) {
                        $stats['no_nom_sel']['count']++;
                        // $stats['no_nom_sel']['data'][] = [$d['id_observation'], $n];*/
                        continue;
                }

                if($n == 'Autre(s) espèce(s) (écrire le/les nom(s) dans les notes)' ||
                   $n == '-') {
                        $stats['sauvages']['count']++;
                        // $stats['sauvages']['data'][] = [$d['id_observation'], $n];
                        continue;
                }

                $MASQUE = 0;

                if(TRY_REMOVE_L) {
                        $n = str_replace(' L.','', $n);
                }

                $orig_n = $n;

        recherche:
                if(TRY_FORCE_START_LINE && !_has($MASQUE, M_TRY_EXACT)) {
                        $n = '^' . $n;
                }

                $s = mysql_query(sprintf($sphinx_req, $n), $dbs);


                if(!$s && ESCAPE_ON_SPHINX_SYNERROR) {
                        $s = mysql_query(sprintf($sphinx_req, str_replace($from,$to,$n)), $dbs);
                }
                if(!$s) {
                        $stats['sphinx errors']['count']++;
                        // $stats['sphinx errors']['data'][] = [$d['id_observation'], $orig_n];
                        continue;
                }

                $c = mysql_num_rows($s);
                //d: fprintf(STDERR, "\t search [nb:%d] \"%s\" (msk:%d)\n", $c, $n, $MASQUE);

                if($c == 0) {
                        if(TRY_SPLIT && !_has($MASQUE, M_TRY_SPLIT)) {
                                require_once('lib-split-auteur.php');
                                $MASQUE |= M_TRY_SPLIT;
                                // $n = RechercheInfosTaxonBeta::supprimerAuteur($orig_n);
                                // list($ret, $m) = RechercheInfosTaxonBeta::contientAuteur($orig_n);
                                $ret = RechercheInfosTaxonBeta::supprimerAuteurBis($orig_n, $m);
                                if($ret) {
                                        // printf("===================== SPLIT: contientAuteur \"%s\" [@%s @%s)\n", $orig_n, $ret, $m);
                                        $n = sprintf('%s @auteur %s', $ret, $m);
                                        goto recherche;
                                }
                        }
                        if(TRY_SPLIT_AND_AUTEUR && !_has($MASQUE, M_TRY_SPLIT_AND_AUTEUR) && strpos($orig_n, ' ') !== FALSE) {
                                require_once('lib-split-auteur.php');
                                $MASQUE |= M_TRY_SPLIT_AND_AUTEUR;
                                $ns = RechercheInfosTaxonBeta::supprimerAuteur($orig_n);
                                if($ns) {
                                        $a = trim(substr($orig_n, strlen($n)));
                                        $n = sprintf("%s @auteur %s", $ns, $a);
                                        // echo "===================== SPLIT N/A: $n\n";
                                        goto recherche;
                                }
                        }

                        $stats['not found']['count']++;
                        // $stats['not found']['data'][] = [$d['id_observation'], $orig_n];
                        continue;
                }

                if($c > 1) {

                        if($c == 2) {
                                if(mysql_fetch_array($s)['group_id'] !=
                                   mysql_fetch_array($s)['group_id']) {
                                        // recherche donne seulement 2 résultats dans 2 référentiels
                                        // potentiellement fixable si l'on peut se référer à $d['nom_referentiel']
                                        $stats['ref pb']['count']++;
                                        // $stats['ref pb']['data'][] = [$d['id_observation'], $orig_n];
                                        continue;
                                }
                        }

                        if(TRY_EXACT && !_has($MASQUE, M_TRY_EXACT)) {
                                $MASQUE |= M_TRY_EXACT;
                                $n = '"^' . trim($orig_n) . '$"';
                                goto recherche;
                        }
                        if(TRY_REF && isset($d['nom_referentiel']) && !_has($MASQUE, M_TRY_REF)) {
                                $MASQUE |= M_TRY_REF;
                                $n = $orig_n . ' @group_id ' . $d['nom_referentiel'];
                                goto recherche;
                        }

                        $stats['too many']['count']++;
                        // $stats['too many']['data'][] = [$d['id_observation'], $orig_n];
                        continue;
                }


        ok:
                $stats['fixable']['count']++;
                // $stats['fixable']['data'][] = [$d['id_observation'], $orig_n];

        }
}

function _has($v, $r) {
        return ($v & $r) == $r;
}


array_walk($stats, function(&$v) { unset($v['data']); });
print_r($stats);
printf("total traité: %d\n", array_sum(array_map(function($v) { return $v['count']; }, $stats)));