<?php
/*
 * @author Raphaël Droz <raphael@tela-botanica.org>
 * @copyright Copyright (c) 2011, 2013 Tela Botanica (accueil@tela-botanica.org)
 * @license http://www.cecill.info/licences/Licence_CeCILL_V2-fr.txt Licence CECILL
 * @license http://www.gnu.org/licenses/gpl.html Licence GNU-GPL
 *
 * Corrige les erreurs de saisie de nom à l'aide d'une recherche via un index sphinx
 * pour les observations ayant un nom saisi et dont l'un au moins de nom_ret[nn],
 * nt ou famille est NULL ou vide.
 *
 */

// time php -d memory_limit=1024M sphinx-maj-nom-ret.php 0 > sphinx-maj.log
// 23 secondes

// settings

// When TRUE, the Sphinx query also reads the i_nvjfl index in addition to i_bdtfx.
define('USE_NVJFL', FALSE);
// When TRUE, a Sphinx query that fails is retried once with the search expression escaped.
define('ESCAPE_ON_SPHINX_SYNERROR', TRUE);

// Search strategies; each one can be switched off independently.
// Prefix the search expression with '^' (skipped once the exact-match retry has been used).
define('TRY_FORCE_START_LINE', TRUE);
// No result: split the author off the name and retry as "<name> @auteur <author>".
define('TRY_SPLIT', TRUE);
// Several results: retry with the quoted expression "^<name>$".
define('TRY_EXACT', TRUE);
// Several results: retry restricted with "@group_id <nom_referentiel>".
define('TRY_REF', TRUE);
// No result: alternative author split based on supprimerAuteur().
define('TRY_SPLIT_AND_AUTEUR', FALSE);
// Remove every " L." occurrence from the entered name before searching.
define('TRY_REMOVE_L', TRUE);

// Bit flags stored in $MASQUE to remember which retry strategies were already
// attempted for the current observation (each strategy runs at most once).
define('M_TRY_SPLIT', 0x01);
define('M_TRY_EXACT', 0x02);
define('M_TRY_REF', 0x04);
define('M_TRY_SPLIT_AND_AUTEUR', 0x08);

error_reporting(E_ALL);

// Main MySQL link: the observations are read from the tb_cel database.
$db = mysql_connect('localhost', 'root', '') or die('no mysql');
mysql_select_db('tb_cel', $db) or die('no tb_cel database');
// The charset is set on the MySQL link, so a failure here is a MySQL problem
// (the previous message wrongly blamed Sphinx).
mysql_query("SET NAMES utf8", $db) or die('cannot set utf8 on mysql');

// Second link, used for the Sphinx queries (MySQL wire protocol on port 9306).
// The 4th argument forces a new link instead of reusing $db.
$dbs = mysql_connect('127.0.0.1:9306', NULL, NULL, TRUE) or die('no sphinx');

// Query template selecting the observations to examine: a name was entered
// (nom_sel) but at least one of nom_ret, nt or famille is NULL or empty.
// sprintf() placeholders, in order: first id, last id, LIMIT offset, LIMIT row count.
// The ORDER BY makes the LIMIT-based paging deterministic: without it MySQL may
// return rows in a different order for each page, skipping or repeating rows.
$req = <<<EOF
SELECT id_observation, nom_sel, nom_sel_nn, nom_ret, nom_ret_nn, nt, famille, nom_referentiel
FROM `cel_obs`
WHERE nom_sel IS NOT NULL AND nom_sel != '' AND
id_observation BETWEEN %d AND %d AND
( nom_ret IS NULL or nom_ret = ''
OR nt IS NULL or nt = 0 or nt = ''
OR famille IS NULL or famille = '' )
ORDER BY id_observation
LIMIT %d, %d
EOF;
// nom_ret_nn is deliberately not part of the filter: it can legitimately be 0
// [taxon identified, without a retained name]
// OR nom_ret_nn IS NULL or nom_ret_nn = 0 or nom_ret_nn = ''

// Command-line arguments, all optional: <first id> <last id> <chunk size>.
// They are cast to integers right away: a non-numeric chunk size used to reach
// the division in the loop bound as a string and divide by zero.
array_shift($argv); // drop the script name
$start = (int) array_shift($argv);
$max = (int) array_shift($argv);
$chunk_size = (int) array_shift($argv);

// 1036314
// Default upper bound: one past the highest observation id.
if(!$max) {
    $max_res = mysql_query("SELECT MAX(id_observation) AS max FROM cel_obs", $db);
    if(!$max_res) { var_dump(mysql_error()); die('end'); }
    $max_row = mysql_fetch_assoc($max_res);
    $max = intval($max_row['max']) + 1;
}
// A missing, zero or negative chunk size falls back to the default page size
// (a negative value would otherwise produce an invalid LIMIT clause).
if($chunk_size <= 0) $chunk_size = 50000;

// escape sphinx
// Parallel search/replace tables for str_replace(): $to[i] is the escaped form
// of $from[i]. The backslash comes first so that the backslashes added by the
// following replacements are not escaped a second time.
$from = array (
    '\\',
    '(', ')', '|', '-', '!', '@', '~', '"', '&', '/', '^', '$', '=',
    "'", "\x00", "\n", "\r", "\x1a",
);
$to = array (
    '\\\\',
    '\\\(', '\\\)', '\\\|', '\\\-', '\\\!', '\\\@', '\\\~', '\\\"', '\\\&', '\\\/', '\\\^', '\\\$', '\\\=',
    "\\'", "\\x00", "\\n", "\\r", "\\x1a",
);

// Result buckets. Every observation read ends up counted in exactly one of them:
//   no_nom_sel    - the entered name is empty once trimmed
//   not found     - Sphinx returned no row, even after the retries
//   too many      - Sphinx still returns several rows after the retries
//   fixable       - Sphinx returned exactly one row
//   sauvages      - the entered name is one of the known non-name values
//   sphinx errors - the Sphinx query failed
//   ref pb        - exactly two rows, coming from two different group_id
// 'data' is meant to hold [id_observation, name] samples; the code filling it
// is currently commented out in the main loop.
$stats = array_fill_keys(
    ['no_nom_sel', 'not found', 'too many', 'fixable', 'sauvages', 'sphinx errors', 'ref pb'],
    ['count' => 0, 'data' => [] ]
);

// Sphinx query template. The doubled %%s survives this first sprintf() as a
// literal %s, which the main loop later fills with the search expression.
// At most 5 rows are requested: the loop only needs to tell 0, 1, 2 and "more".
$sphinx_req = sprintf("SELECT * FROM i_bdtfx %s WHERE MATCH('%%s') LIMIT 5", USE_NVJFL ? ", i_nvjfl" : "");

// Main loop: read the candidate observations page by page and, for each
// entered name, look it up in the Sphinx index. Depending on the number of
// rows returned, the observation is counted in one of the $stats buckets;
// when the count is 0 or greater than 1, alternative search expressions are
// tried first (each at most once per observation, tracked in $MASQUE).
for($current = 0; $current < intval($max/$chunk_size) + 1; $current++) {
    // printf("current = %d, chunk_size = %d, max = %d (rmax = %d) [real limit: %d]\n", $current, $chunk_size, $max, intval($max/$chunk_size) + 1, $current*$chunk_size);
    // printf(strtr($req, "\n", " ") . "\n", $start, $max, $current*$chunk_size, $chunk_size);
    $data = mysql_query(sprintf($req, $start, $max, $current*$chunk_size, $chunk_size), $db);
    if(!$data) { var_dump(mysql_error()); die('end'); }

    // The LIMIT offset walks through the filtered rows: once a page comes back
    // empty every following page is empty too, so stop querying.
    if(mysql_num_rows($data) == 0) break;

    while($d = mysql_fetch_assoc($data)) {
        $n = trim($d['nom_sel']);
        //d: fprintf(STDERR, "$n\n");

        if(!$n) {
            $stats['no_nom_sel']['count']++;
            // $stats['no_nom_sel']['data'][] = [$d['id_observation'], $n];
            continue;
        }

        // Known values that are not taxon names (presumably input-form
        // placeholders - TODO confirm): counted apart, never searched.
        if($n == 'Autre(s) espèce(s) (écrire le/les nom(s) dans les notes)' ||
           $n == '-') {
            $stats['sauvages']['count']++;
            // $stats['sauvages']['data'][] = [$d['id_observation'], $n];
            continue;
        }

        // Bitmask of the retry strategies already used for this observation.
        $MASQUE = 0;

        if(TRY_REMOVE_L) {
            $n = str_replace(' L.','', $n);
        }

        // Reference name every retry starts again from; $n is the expression
        // actually sent to Sphinx and is rewritten before each "goto recherche".
        $orig_n = $n;

    recherche:
        if(TRY_FORCE_START_LINE && !_has($MASQUE, M_TRY_EXACT)) {
            $n = '^' . $n;
        }

        // First attempt: the expression is sent as is.
        // NOTE(review): $n comes from the database and is interpolated
        // unescaped into the statement; only a failed query gets the escaped retry below.
        $s = mysql_query(sprintf($sphinx_req, $n), $dbs);

        if(!$s && ESCAPE_ON_SPHINX_SYNERROR) {
            $s = mysql_query(sprintf($sphinx_req, str_replace($from,$to,$n)), $dbs);
        }
        if(!$s) {
            $stats['sphinx errors']['count']++;
            // $stats['sphinx errors']['data'][] = [$d['id_observation'], $orig_n];
            continue;
        }

        $c = mysql_num_rows($s);
        //d: fprintf(STDERR, "\t search [nb:%d] \"%s\" (msk:%d)\n", $c, $n, $MASQUE);

        if($c == 0) {
            if(TRY_SPLIT && !_has($MASQUE, M_TRY_SPLIT)) {
                require_once('lib-split-auteur.php');
                $MASQUE |= M_TRY_SPLIT;
                // $n = RechercheInfosTaxonBeta::supprimerAuteur($orig_n);
                // list($ret, $m) = RechercheInfosTaxonBeta::contientAuteur($orig_n);
                // presumably $m receives the author part by reference - verify in lib-split-auteur.php
                $ret = RechercheInfosTaxonBeta::supprimerAuteurBis($orig_n, $m);
                if($ret) {
                    // printf("===================== SPLIT: contientAuteur \"%s\" [@%s @%s)\n", $orig_n, $ret, $m);
                    $n = sprintf('%s @auteur %s', $ret, $m);
                    goto recherche;
                }
            }
            if(TRY_SPLIT_AND_AUTEUR && !_has($MASQUE, M_TRY_SPLIT_AND_AUTEUR) && strpos($orig_n, ' ') !== FALSE) {
                require_once('lib-split-auteur.php');
                $MASQUE |= M_TRY_SPLIT_AND_AUTEUR;
                $ns = RechercheInfosTaxonBeta::supprimerAuteur($orig_n);
                if($ns) {
                    // The author is what follows the author-less name in the
                    // original input. The offset must be the length of $ns:
                    // $n holds the last search expression (with '^' or a
                    // previous "@auteur" rewrite), not the name.
                    // NOTE(review): assumes supprimerAuteur() returns a prefix of $orig_n - confirm
                    $a = trim(substr($orig_n, strlen($ns)));
                    $n = sprintf("%s @auteur %s", $ns, $a);
                    // echo "===================== SPLIT N/A: $n\n";
                    goto recherche;
                }
            }

            $stats['not found']['count']++;
            // $stats['not found']['data'][] = [$d['id_observation'], $orig_n];
            continue;
        }

        if($c > 1) {

            if($c == 2) {
                // Two consecutive fetches read the two rows of the result.
                if(mysql_fetch_array($s)['group_id'] !=
                   mysql_fetch_array($s)['group_id']) {
                    // the search yields exactly 2 results, in 2 different referentials:
                    // potentially fixable if $d['nom_referentiel'] can be relied upon
                    $stats['ref pb']['count']++;
                    // $stats['ref pb']['data'][] = [$d['id_observation'], $orig_n];
                    continue;
                }
            }

            if(TRY_EXACT && !_has($MASQUE, M_TRY_EXACT)) {
                $MASQUE |= M_TRY_EXACT;
                $n = '"^' . trim($orig_n) . '$"';
                goto recherche;
            }
            if(TRY_REF && isset($d['nom_referentiel']) && !_has($MASQUE, M_TRY_REF)) {
                $MASQUE |= M_TRY_REF;
                $n = $orig_n . ' @group_id ' . $d['nom_referentiel'];
                goto recherche;
            }

            $stats['too many']['count']++;
            // $stats['too many']['data'][] = [$d['id_observation'], $orig_n];
            continue;
        }

        // Exactly one row: the entered name maps to a single index entry.
        $stats['fixable']['count']++;
        // $stats['fixable']['data'][] = [$d['id_observation'], $orig_n];

    }
}

/**
 * Tells whether every bit of a flag (or combination of flags) is set in a bitmask.
 *
 * @param int $v bitmask to inspect
 * @param int $r flag(s) that must all be present in $v
 * @return bool TRUE when all the bits of $r are set in $v
 */
function _has($v, $r) {
    return ($v & $r) == $r;
}

// Final report: drop the per-bucket sample lists so that only the counters are
// printed, then print the number of observations read (sum of all counters).
array_walk($stats, function(&$v) { unset($v['data']); });
print_r($stats);
printf("total traité: %d\n", array_sum(array_map(function($v) { return $v['count']; }, $stats)));