New file |
0,0 → 1,203 |
<?php |
/* |
* @author Raphaël Droz <raphael@tela-botanica.org> |
* @copyright Copyright (c) 2011, 2013 Tela Botanica (accueil@tela-botanica.org) |
* @license http://www.cecill.info/licences/Licence_CeCILL_V2-fr.txt Licence CECILL |
* @license http://www.gnu.org/licenses/gpl.html Licence GNU-GPL |
* |
* Corrige les erreurs de saisie de nom à l'aide d'une recherche via un index sphinx |
* pour les observations ayant un nom saisi et dont l'un au moins de nom_ret[nn], |
* nt ou famille est NULL ou vide. |
* |
*/ |
|
// time php -d memory_limit=1024M sphinx-maj-nom-ret.php 0 > sphinx-maj.log |
// 23 secondes |
|
// settings |
define('USE_NVJFL', FALSE); |
define('ESCAPE_ON_SPHINX_SYNERROR', TRUE); |
|
define('TRY_FORCE_START_LINE', TRUE); |
define('TRY_SPLIT', TRUE); |
define('TRY_EXACT', TRUE); |
define('TRY_REF', TRUE); |
define('TRY_SPLIT_AND_AUTEUR', FALSE); |
define('TRY_REMOVE_L', TRUE); |
|
define('M_TRY_SPLIT', 0x01); |
define('M_TRY_EXACT', 0x02); |
define('M_TRY_REF', 0x04); |
define('M_TRY_SPLIT_AND_AUTEUR', 0x08); |
|
error_reporting(E_ALL); |
$db = mysql_connect('localhost', 'root', '') or die('no mysql'); |
mysql_select_db('tb_cel', $db); |
mysql_query("SET NAMES utf8", $db) or die('no sphinx'); |
$dbs = mysql_connect('127.0.0.1:9306', NULL, NULL, TRUE); |
|
$req = <<<EOF |
SELECT id_observation, nom_sel, nom_sel_nn, nom_ret, nom_ret_nn, nt, famille, nom_referentiel |
FROM `cel_obs` |
WHERE nom_sel IS NOT NULL AND nom_sel != '' AND |
id_observation BETWEEN %d AND %d AND |
( nom_ret IS NULL or nom_ret = '' |
OR nt IS NULL or nt = 0 or nt = '' |
OR famille IS NULL or famille = '' ) |
LIMIT %d, %d |
EOF; |
// non: car nom_ret_nn peut-être légitimement à 0 [taxon identifié, sans nom_retenu] |
// OR nom_ret_nn IS NULL or nom_ret_nn = 0 or nom_ret_nn = '' |
|
|
array_shift($argv); |
$start = array_shift($argv); |
$max = array_shift($argv); |
$chunk_size = array_shift($argv); |
|
if(!$start) $start = 0; |
// 1036314 |
if(!$max) $max = intval(mysql_fetch_assoc(mysql_query("SELECT MAX(id_observation) AS max FROM cel_obs", $db))['max']) + 1; |
if(!$chunk_size) $chunk_size = 50000; |
|
|
// escape sphinx |
$from = array ( '\\', '(',')','|','-','!','@','~','"','&', '/', '^', '$', '=', "'", "\x00", "\n", "\r", "\x1a" ); |
$to = array ( '\\\\', '\\\(','\\\)','\\\|','\\\-','\\\!','\\\@','\\\~','\\\"', '\\\&', '\\\/', '\\\^', '\\\$', '\\\=', "\\'", "\\x00", "\\n", "\\r", "\\x1a" ); |
|
|
$stats = ['no_nom_sel' => ['count' => 0, 'data' => [] ], |
'not found' => ['count' => 0, 'data' => [] ], |
'too many' => ['count' => 0, 'data' => [] ], |
'fixable' => ['count' => 0, 'data' => [] ], |
'sauvages' => ['count' => 0, 'data' => [] ], |
'sphinx errors' => ['count' => 0, 'data' => [] ], |
'ref pb' => ['count' => 0, 'data' => [] ], ]; |
|
$sphinx_req = sprintf("SELECT * FROM i_bdtfx %s WHERE MATCH('%%s') LIMIT 5", USE_NVJFL ? ", i_nvjfl" : ""); |
|
for($current = 0; $current < intval($max/$chunk_size) + 1; $current++) { |
// printf("current = %d, chunk_size = %d, max = %d (rmax = %d) [real limit: %d]\n", $current, $chunk_size, $max, intval($max/$chunk_size) + 1, $current*$chunk_size); |
// printf(strtr($req, "\n", " ") . "\n", $start, $max, $current*$chunk_size, $chunk_size); |
$data = mysql_query(sprintf($req, $start, $max, $current*$chunk_size, $chunk_size), $db); |
if(!$data) { var_dump(mysql_error()); die('end'); } |
while($d = mysql_fetch_assoc($data)) { |
$n = trim($d['nom_sel']); |
//d: fprintf(STDERR, "$n\n"); |
|
if(!$n) { |
$stats['no_nom_sel']['count']++; |
// $stats['no_nom_sel']['data'][] = [$d['id_observation'], $n];*/ |
continue; |
} |
|
if($n == 'Autre(s) espèce(s) (écrire le/les nom(s) dans les notes)' || |
$n == '-') { |
$stats['sauvages']['count']++; |
// $stats['sauvages']['data'][] = [$d['id_observation'], $n]; |
continue; |
} |
|
$MASQUE = 0; |
|
if(TRY_REMOVE_L) { |
$n = str_replace(' L.','', $n); |
} |
|
$orig_n = $n; |
|
recherche: |
if(TRY_FORCE_START_LINE && !_has($MASQUE, M_TRY_EXACT)) { |
$n = '^' . $n; |
} |
|
$s = mysql_query(sprintf($sphinx_req, $n), $dbs); |
|
|
if(!$s && ESCAPE_ON_SPHINX_SYNERROR) { |
$s = mysql_query(sprintf($sphinx_req, str_replace($from,$to,$n)), $dbs); |
} |
if(!$s) { |
$stats['sphinx errors']['count']++; |
// $stats['sphinx errors']['data'][] = [$d['id_observation'], $orig_n]; |
continue; |
} |
|
$c = mysql_num_rows($s); |
//d: fprintf(STDERR, "\t search [nb:%d] \"%s\" (msk:%d)\n", $c, $n, $MASQUE); |
|
if($c == 0) { |
if(TRY_SPLIT && !_has($MASQUE, M_TRY_SPLIT)) { |
require_once('lib-split-auteur.php'); |
$MASQUE |= M_TRY_SPLIT; |
// $n = RechercheInfosTaxonBeta::supprimerAuteur($orig_n); |
// list($ret, $m) = RechercheInfosTaxonBeta::contientAuteur($orig_n); |
$ret = RechercheInfosTaxonBeta::supprimerAuteurBis($orig_n, $m); |
if($ret) { |
// printf("===================== SPLIT: contientAuteur \"%s\" [@%s @%s)\n", $orig_n, $ret, $m); |
$n = sprintf('%s @auteur %s', $ret, $m); |
goto recherche; |
} |
} |
if(TRY_SPLIT_AND_AUTEUR && !_has($MASQUE, M_TRY_SPLIT_AND_AUTEUR) && strpos($orig_n, ' ') !== FALSE) { |
require_once('lib-split-auteur.php'); |
$MASQUE |= M_TRY_SPLIT_AND_AUTEUR; |
$ns = RechercheInfosTaxonBeta::supprimerAuteur($orig_n); |
if($ns) { |
$a = trim(substr($orig_n, strlen($n))); |
$n = sprintf("%s @auteur %s", $ns, $a); |
// echo "===================== SPLIT N/A: $n\n"; |
goto recherche; |
} |
} |
|
$stats['not found']['count']++; |
// $stats['not found']['data'][] = [$d['id_observation'], $orig_n]; |
continue; |
} |
|
if($c > 1) { |
|
if($c == 2) { |
if(mysql_fetch_array($s)['group_id'] != |
mysql_fetch_array($s)['group_id']) { |
// recherche donne seulement 2 résultats dans 2 référentiels |
// potentiellement fixable si l'on peut se référer à $d['nom_referentiel'] |
$stats['ref pb']['count']++; |
// $stats['ref pb']['data'][] = [$d['id_observation'], $orig_n]; |
continue; |
} |
} |
|
if(TRY_EXACT && !_has($MASQUE, M_TRY_EXACT)) { |
$MASQUE |= M_TRY_EXACT; |
$n = '"^' . trim($orig_n) . '$"'; |
goto recherche; |
} |
if(TRY_REF && isset($d['nom_referentiel']) && !_has($MASQUE, M_TRY_REF)) { |
$MASQUE |= M_TRY_REF; |
$n = $orig_n . ' @group_id ' . $d['nom_referentiel']; |
goto recherche; |
} |
|
$stats['too many']['count']++; |
// $stats['too many']['data'][] = [$d['id_observation'], $orig_n]; |
continue; |
} |
|
|
ok: |
$stats['fixable']['count']++; |
// $stats['fixable']['data'][] = [$d['id_observation'], $orig_n]; |
|
} |
} |
|
function _has($v, $r) { |
return ($v & $r) == $r; |
} |
|
|
array_walk($stats, function(&$v) { unset($v['data']); }); |
print_r($stats); |
printf("total traité: %d\n", array_sum(array_map(function($v) { return $v['count']; }, $stats))); |