| 866 | raphael | 1 | <?php
 | 
        
           |  |  | 2 | /*
 | 
        
           |  |  | 3 |  * @author		Raphaël Droz <raphael@tela-botanica.org>
 | 
        
           |  |  | 4 |  * @copyright	Copyright (c) 2011, 2013 Tela Botanica (accueil@tela-botanica.org)
 | 
        
           |  |  | 5 |  * @license		http://www.cecill.info/licences/Licence_CeCILL_V2-fr.txt Licence CECILL
 | 
        
           |  |  | 6 |  * @license		http://www.gnu.org/licenses/gpl.html Licence GNU-GPL
 | 
        
           |  |  | 7 |  *
 | 
        
           |  |  | 8 |  * Corrige les erreurs de saisie de nom à l'aide d'une recherche via un index sphinx
 | 
        
           |  |  | 9 |  * pour les observations ayant un nom saisi et dont l'un au moins de nom_ret[nn],
 | 
        
           |  |  | 10 |  * nt ou famille est NULL ou vide.
 | 
        
           |  |  | 11 |  *
 | 
        
           |  |  | 12 |  */
 | 
        
           |  |  | 13 |   | 
        
           |  |  | 14 | // time php -d memory_limit=1024M sphinx-maj-nom-ret.php 0 > sphinx-maj.log
 | 
        
           |  |  | 15 | // 23 secondes
 | 
        
           |  |  | 16 |   | 
        
           |  |  | 17 | error_reporting(E_ALL);
 | 
        
           |  |  | 18 | $db = mysql_connect('localhost', 'root', '');
 | 
        
           |  |  | 19 | mysql_select_db('tb_cel', $db);
 | 
        
           |  |  | 20 | mysql_query("SET NAMES utf8", $db);
 | 
        
           |  |  | 21 | $dbs = mysql_connect('127.0.0.1:9306', NULL, NULL, TRUE);
 | 
        
           |  |  | 22 |   | 
        
           |  |  | 23 | $req = <<<EOF
 | 
        
           |  |  | 24 | 	SELECT id_observation, nom_sel, nom_sel_nn,nom_ret,nom_ret_nn,nt,famille
 | 
        
           |  |  | 25 |         FROM `cel_obs`
 | 
        
           |  |  | 26 |         WHERE nom_sel IS NOT NULL AND nom_sel != '' AND
 | 
        
           |  |  | 27 | 		id_observation BETWEEN %d AND %d AND
 | 
        
           |  |  | 28 | 		( nom_ret IS NULL or nom_ret = ''
 | 
        
           |  |  | 29 | 		  OR nom_ret_nn IS NULL or nom_ret_nn = 0 or nom_ret_nn = ''
 | 
        
           |  |  | 30 | 		  OR nt IS NULL or nt = 0 or nt = ''
 | 
        
           |  |  | 31 | 		  OR famille IS NULL or famille = '' )
 | 
        
           |  |  | 32 | 	LIMIT %d, %d
 | 
        
           |  |  | 33 | EOF;
 | 
        
           |  |  | 34 |   | 
        
           |  |  | 35 | array_shift($argv);
 | 
        
           |  |  | 36 | $start = array_shift($argv);
 | 
        
           |  |  | 37 | $max = array_shift($argv);
 | 
        
           |  |  | 38 | $chunk_size = array_shift($argv);
 | 
        
           |  |  | 39 |   | 
        
           |  |  | 40 | if(!$start) $start = 0;
 | 
        
           |  |  | 41 | // 1036314
 | 
        
           |  |  | 42 | if(!$max) $max = intval(mysql_fetch_assoc(mysql_query("SELECT MAX(id_observation) AS max FROM cel_obs", $db))['max']) + 1;
 | 
        
           |  |  | 43 | if(!$chunk_size) $chunk_size = 50000;
 | 
        
           |  |  | 44 |   | 
        
           |  |  | 45 |   | 
        
           |  |  | 46 | // escape sphinx
 | 
        
           |  |  | 47 | $from = array ( '\\', '(',')','|','-','!','@','~','"','&', '/', '^', '$', '=', "'", "\x00", "\n", "\r", "\x1a" );
 | 
        
           |  |  | 48 | $to   = array ( '\\\\', '\\\(','\\\)','\\\|','\\\-','\\\!','\\\@','\\\~','\\\"', '\\\&', '\\\/', '\\\^', '\\\$', '\\\=', "\\'", "\\x00", "\\n", "\\r", "\\x1a" );
 | 
        
           |  |  | 49 |   | 
        
           |  |  | 50 |   | 
        
           |  |  | 51 | $stats = ['no_nom_sel' => ['count' => 0, 'data' => [] ],
 | 
        
           |  |  | 52 | 		  'not found' => ['count' => 0, 'data' => [] ],
 | 
        
           |  |  | 53 | 		  'too many' => ['count' => 0, 'data' => [] ],
 | 
        
           |  |  | 54 | 		  'fixable' => ['count' => 0, 'data' => [] ],
 | 
        
           |  |  | 55 | 		  'sauvages' => ['count' => 0, 'data' => [] ],
 | 
        
           |  |  | 56 | 		  'sphinx errors' => ['count' => 0, 'data' => [] ], ];
 | 
        
           |  |  | 57 |   | 
        
           |  |  | 58 |   | 
        
           |  |  | 59 | for($current = 0; $current < intval($max/$chunk_size) + 1; $current++) {
 | 
        
           |  |  | 60 | 	// printf("current = %d, chunk_size = %d, max = %d (rmax = %d) [real limit: %d]\n", $current, $chunk_size, $max, intval($max/$chunk_size) + 1, $current*$chunk_size);
 | 
        
           |  |  | 61 | 	// printf(strtr($req, "\n", " ") . "\n", $start, $max, $current*$chunk_size, $chunk_size);
 | 
        
           |  |  | 62 | 	$data = mysql_query(sprintf($req, $start, $max, $current*$chunk_size, $chunk_size), $db);
 | 
        
           |  |  | 63 | 	if(!$data) { var_dump(mysql_error()); die('end'); }
 | 
        
           |  |  | 64 | 	while($d = mysql_fetch_assoc($data)) {
 | 
        
           |  |  | 65 | 		$n  = $d['nom_sel'];
 | 
        
           |  |  | 66 | 		if(!$n) {
 | 
        
           |  |  | 67 | 			$stats['no_nom_sel']['count']++;
 | 
        
           |  |  | 68 | 			// $stats['no_nom_sel']['data'][] = [$d['id_observation'], $n];*/
 | 
        
           |  |  | 69 | 			continue;
 | 
        
           |  |  | 70 | 		}
 | 
        
           |  |  | 71 |   | 
        
           |  |  | 72 | 		if($n == 'Autre(s) espèce(s) (écrire le/les nom(s) dans les notes)' ||
 | 
        
           |  |  | 73 | 		   $n == '-') {
 | 
        
           |  |  | 74 | 			$stats['sauvages']['count']++;
 | 
        
           |  |  | 75 | 			// $stats['sauvages']['data'][] = [$d['id_observation'], $n];
 | 
        
           |  |  | 76 | 			continue;
 | 
        
           |  |  | 77 | 		}
 | 
        
           |  |  | 78 |   | 
        
           |  |  | 79 | 		//$s = mysql_query("SELECT * FROM i_bdtfx WHERE MATCH('" . str_replace($from,$to,$n) . "') LIMIT 5", $dbs);
 | 
        
           |  |  | 80 | 		$s = mysql_query("SELECT * FROM i_bdtfx WHERE MATCH('" . $n . "') LIMIT 5", $dbs);
 | 
        
           |  |  | 81 | 		if(!$s) {
 | 
        
           |  |  | 82 | 			$stats['sphinx errors']['count']++;
 | 
        
           |  |  | 83 | 			// $stats['sphinx errors']['data'][] = [$d['id_observation'], $n];
 | 
        
           |  |  | 84 | 			continue;
 | 
        
           |  |  | 85 | 		}
 | 
        
           |  |  | 86 |   | 
        
           |  |  | 87 | 		$c = mysql_num_rows($s);
 | 
        
           |  |  | 88 | 		if($c == 0) {
 | 
        
           |  |  | 89 | 			$stats['not found']['count']++;
 | 
        
           |  |  | 90 | 			// $stats['not found']['data'][] = [$d['id_observation'], $n];
 | 
        
           |  |  | 91 | 			continue;
 | 
        
           |  |  | 92 | 		}
 | 
        
           |  |  | 93 |   | 
        
           |  |  | 94 | 		if($c > 1) {
 | 
        
           |  |  | 95 | 			$stats['too many']['count']++;
 | 
        
           |  |  | 96 | 			// $stats['too many']['data'][] = [$d['id_observation'], $n];
 | 
        
           |  |  | 97 | 			continue;
 | 
        
           |  |  | 98 | 		}
 | 
        
           |  |  | 99 |   | 
        
           |  |  | 100 | 		$stats['fixable']['count']++;
 | 
        
           |  |  | 101 | 		// $stats['fixable']['data'][] = [$d['id_observation'], $n];
 | 
        
           |  |  | 102 |   | 
        
           |  |  | 103 | 	}
 | 
        
           |  |  | 104 | }
 | 
        
           |  |  | 105 |   | 
        
           |  |  | 106 | array_walk($stats, function(&$v) { unset($v['data']); });
 | 
        
           |  |  | 107 | print_r($stats);
 | 
        
           |  |  | 108 | printf("total traité: %d\n", array_sum(array_map(function($v) { return $v['count']; }, $stats)));
 |