Rev 867 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
<?php/** @author Raphaël Droz <raphael@tela-botanica.org>* @copyright Copyright (c) 2011, 2013 Tela Botanica (accueil@tela-botanica.org)* @license http://www.cecill.info/licences/Licence_CeCILL_V2-fr.txt Licence CECILL* @license http://www.gnu.org/licenses/gpl.html Licence GNU-GPL** Corrige les erreurs de saisie de nom à l'aide d'une recherche via un index sphinx* pour les observations ayant un nom saisi et dont l'un au moins de nom_ret[nn],* nt ou famille est NULL ou vide.**/// time php -d memory_limit=1024M sphinx-maj-nom-ret.php 0 > sphinx-maj.log// 23 secondeserror_reporting(E_ALL);$db = mysql_connect('localhost', 'root', '');mysql_select_db('tb_cel', $db);mysql_query("SET NAMES utf8", $db);$dbs = mysql_connect('127.0.0.1:9306', NULL, NULL, TRUE);$req = <<<EOFSELECT id_observation, nom_sel, nom_sel_nn,nom_ret,nom_ret_nn,nt,familleFROM `cel_obs`WHERE nom_sel IS NOT NULL AND nom_sel != '' ANDid_observation BETWEEN %d AND %d AND( nom_ret IS NULL or nom_ret = ''OR nom_ret_nn IS NULL or nom_ret_nn = 0 or nom_ret_nn = ''OR nt IS NULL or nt = 0 or nt = ''OR famille IS NULL or famille = '' )LIMIT %d, %dEOF;array_shift($argv);$start = array_shift($argv);$max = array_shift($argv);$chunk_size = array_shift($argv);if(!$start) $start = 0;// 1036314if(!$max) $max = intval(mysql_fetch_assoc(mysql_query("SELECT MAX(id_observation) AS max FROM cel_obs", $db))['max']) + 1;if(!$chunk_size) $chunk_size = 50000;// escape sphinx$from = array ( '\\', '(',')','|','-','!','@','~','"','&', '/', '^', '$', '=', "'", "\x00", "\n", "\r", "\x1a" );$to = array ( '\\\\', '\\\(','\\\)','\\\|','\\\-','\\\!','\\\@','\\\~','\\\"', '\\\&', '\\\/', '\\\^', '\\\$', '\\\=', "\\'", "\\x00", "\\n", "\\r", "\\x1a" );$stats = ['no_nom_sel' => ['count' => 0, 'data' => [] ],'not found' => ['count' => 0, 'data' => [] ],'too many' => ['count' => 0, 'data' => [] ],'fixable' => ['count' => 0, 'data' => [] ],'sauvages' => ['count' => 0, 'data' => [] ],'sphinx errors' => ['count' => 0, 'data' => [] ], ];for($current = 0; $current < intval($max/$chunk_size) + 1; $current++) {// printf("current = %d, chunk_size = %d, max = %d (rmax = %d) [real limit: %d]\n", $current, $chunk_size, $max, intval($max/$chunk_size) + 1, $current*$chunk_size);// printf(strtr($req, "\n", " ") . "\n", $start, $max, $current*$chunk_size, $chunk_size);$data = mysql_query(sprintf($req, $start, $max, $current*$chunk_size, $chunk_size), $db);if(!$data) { var_dump(mysql_error()); die('end'); }while($d = mysql_fetch_assoc($data)) {$n = $d['nom_sel'];if(!$n) {$stats['no_nom_sel']['count']++;// $stats['no_nom_sel']['data'][] = [$d['id_observation'], $n];*/continue;}if($n == 'Autre(s) espèce(s) (écrire le/les nom(s) dans les notes)' ||$n == '-') {$stats['sauvages']['count']++;// $stats['sauvages']['data'][] = [$d['id_observation'], $n];continue;}//$s = mysql_query("SELECT * FROM i_bdtfx WHERE MATCH('" . str_replace($from,$to,$n) . "') LIMIT 5", $dbs);$s = mysql_query("SELECT * FROM i_bdtfx WHERE MATCH('" . $n . "') LIMIT 5", $dbs);if(!$s) {$stats['sphinx errors']['count']++;// $stats['sphinx errors']['data'][] = [$d['id_observation'], $n];continue;}$c = mysql_num_rows($s);if($c == 0) {$stats['not found']['count']++;// $stats['not found']['data'][] = [$d['id_observation'], $n];continue;}if($c > 1) {$stats['too many']['count']++;// $stats['too many']['data'][] = [$d['id_observation'], $n];continue;}$stats['fixable']['count']++;// $stats['fixable']['data'][] = [$d['id_observation'], $n];}}array_walk($stats, function(&$v) { unset($v['data']); });print_r($stats);printf("total traité: %d\n", array_sum(array_map(function($v) { return $v['count']; }, $stats)));