Subversion Repositories eFlore/Projets.eflore-projets

Compare Revisions

Ignore whitespace Rev 876 → Rev 875

/trunk/scripts/modules/cel/lib-split-auteur.php
File deleted
\ No newline at end of file
/trunk/scripts/modules/cel/sphinx-maj-nom-ret.php
14,30 → 14,14
// time php -d memory_limit=1024M sphinx-maj-nom-ret.php 0 > sphinx-maj.log
// 23 secondes
 
// settings
define('USE_NVJFL', FALSE);
define('ESCAPE_ON_SPHINX_SYNERROR', TRUE);
 
define('TRY_FORCE_START_LINE', TRUE);
define('TRY_SPLIT', TRUE);
define('TRY_EXACT', TRUE);
define('TRY_REF', TRUE);
define('TRY_SPLIT_AND_AUTEUR', FALSE);
define('TRY_REMOVE_L', TRUE);
 
define('M_TRY_SPLIT', 0x01);
define('M_TRY_EXACT', 0x02);
define('M_TRY_REF', 0x04);
define('M_TRY_SPLIT_AND_AUTEUR', 0x08);
 
error_reporting(E_ALL);
$db = mysql_connect('localhost', 'root', '') or die('no mysql');
$db = mysql_connect('localhost', 'root', '');
mysql_select_db('tb_cel', $db);
mysql_query("SET NAMES utf8", $db) or die('no sphinx');
mysql_query("SET NAMES utf8", $db);
$dbs = mysql_connect('127.0.0.1:9306', NULL, NULL, TRUE);
 
$req = <<<EOF
SELECT id_observation, nom_sel, nom_sel_nn, nom_ret, nom_ret_nn, nt, famille, nom_referentiel
SELECT id_observation, nom_sel, nom_sel_nn,nom_ret,nom_ret_nn,nt,famille
FROM `cel_obs`
WHERE nom_sel IS NOT NULL AND nom_sel != '' AND
id_observation BETWEEN %d AND %d AND
69,10 → 53,8
'too many' => ['count' => 0, 'data' => [] ],
'fixable' => ['count' => 0, 'data' => [] ],
'sauvages' => ['count' => 0, 'data' => [] ],
'sphinx errors' => ['count' => 0, 'data' => [] ],
'ref pb' => ['count' => 0, 'data' => [] ], ];
'sphinx errors' => ['count' => 0, 'data' => [] ], ];
 
$sphinx_req = sprintf("SELECT * FROM i_bdtfx %s WHERE MATCH('%%s') LIMIT 5", USE_NVJFL ? ", i_nvjfl" : "");
 
for($current = 0; $current < intval($max/$chunk_size) + 1; $current++) {
// printf("current = %d, chunk_size = %d, max = %d (rmax = %d) [real limit: %d]\n", $current, $chunk_size, $max, intval($max/$chunk_size) + 1, $current*$chunk_size);
80,9 → 62,7
$data = mysql_query(sprintf($req, $start, $max, $current*$chunk_size, $chunk_size), $db);
if(!$data) { var_dump(mysql_error()); die('end'); }
while($d = mysql_fetch_assoc($data)) {
$n = trim($d['nom_sel']);
//d: fprintf(STDERR, "$n\n");
 
$n = $d['nom_sel'];
if(!$n) {
$stats['no_nom_sel']['count']++;
// $stats['no_nom_sel']['data'][] = [$d['id_observation'], $n];*/
96,106 → 76,34
continue;
}
 
$MASQUE = 0;
 
if(TRY_REMOVE_L) {
$n = str_replace(' L.','', $n);
}
 
$orig_n = $n;
 
recherche:
if(TRY_FORCE_START_LINE && !_has($MASQUE, M_TRY_EXACT)) {
$n = '^' . $n;
}
 
$s = mysql_query(sprintf($sphinx_req, $n), $dbs);
 
 
if(!$s && ESCAPE_ON_SPHINX_SYNERROR) {
$s = mysql_query(sprintf($sphinx_req, str_replace($from,$to,$n)), $dbs);
}
//$s = mysql_query("SELECT * FROM i_bdtfx WHERE MATCH('" . str_replace($from,$to,$n) . "') LIMIT 5", $dbs);
$s = mysql_query("SELECT * FROM i_bdtfx, i_nvjfl WHERE MATCH('" . $n . "') LIMIT 5", $dbs);
if(!$s) {
$stats['sphinx errors']['count']++;
// $stats['sphinx errors']['data'][] = [$d['id_observation'], $orig_n];
// $stats['sphinx errors']['data'][] = [$d['id_observation'], $n];
continue;
}
 
$c = mysql_num_rows($s);
//d: fprintf(STDERR, "\t search [nb:%d] \"%s\" (msk:%d)\n", $c, $n, $MASQUE);
 
if($c == 0) {
if(TRY_SPLIT && !_has($MASQUE, M_TRY_SPLIT)) {
require_once('lib-split-auteur.php');
$MASQUE |= M_TRY_SPLIT;
// $n = RechercheInfosTaxonBeta::supprimerAuteur($orig_n);
// list($ret, $m) = RechercheInfosTaxonBeta::contientAuteur($orig_n);
$ret = RechercheInfosTaxonBeta::supprimerAuteurBis($orig_n, $m);
if($ret) {
// printf("===================== SPLIT: contientAuteur \"%s\" [@%s @%s)\n", $orig_n, $ret, $m);
$n = sprintf('%s @auteur %s', $ret, $m);
goto recherche;
}
}
if(TRY_SPLIT_AND_AUTEUR && !_has($MASQUE, M_TRY_SPLIT_AND_AUTEUR) && strpos($orig_n, ' ') !== FALSE) {
require_once('lib-split-auteur.php');
$MASQUE |= M_TRY_SPLIT_AND_AUTEUR;
$ns = RechercheInfosTaxonBeta::supprimerAuteur($orig_n);
if($ns) {
$a = trim(substr($orig_n, strlen($n)));
$n = sprintf("%s @auteur %s", $ns, $a);
// echo "===================== SPLIT N/A: $n\n";
goto recherche;
}
}
 
$stats['not found']['count']++;
// $stats['not found']['data'][] = [$d['id_observation'], $orig_n];
// $stats['not found']['data'][] = [$d['id_observation'], $n];
continue;
}
 
if($c > 1) {
 
if($c == 2) {
if(mysql_fetch_array($s)['group_id'] !=
mysql_fetch_array($s)['group_id']) {
// recherche donne seulement 2 résultats dans 2 référentiels
// potentiellement fixable si l'on peut se référer à $d['nom_referentiel']
$stats['ref pb']['count']++;
// $stats['ref pb']['data'][] = [$d['id_observation'], $orig_n];
continue;
}
}
 
if(TRY_EXACT && !_has($MASQUE, M_TRY_EXACT)) {
$MASQUE |= M_TRY_EXACT;
$n = '"^' . trim($orig_n) . '$"';
goto recherche;
}
if(TRY_REF && isset($d['nom_referentiel']) && !_has($MASQUE, M_TRY_REF)) {
$MASQUE |= M_TRY_REF;
$n = $orig_n . ' @group_id ' . $d['nom_referentiel'];
goto recherche;
}
 
$stats['too many']['count']++;
// $stats['too many']['data'][] = [$d['id_observation'], $orig_n];
// $stats['too many']['data'][] = [$d['id_observation'], $n];
continue;
}
 
 
ok:
$stats['fixable']['count']++;
// $stats['fixable']['data'][] = [$d['id_observation'], $orig_n];
// $stats['fixable']['data'][] = [$d['id_observation'], $n];
 
}
}
 
// Bitmask test: tells whether every bit set in $r is also set in $v.
// Used with the M_TRY_* flags to know if a given search strategy was
// already attempted for the current name.
function _has($v, $r) {
    $shared = $v & $r;
    if ($shared == $r) {
        return true;
    }
    return false;
}
 
 
// Drop the per-category 'data' lists so that only the counters are dumped.
foreach ($stats as $label => $bucket) {
    unset($stats[$label]['data']);
}
print_r($stats);

// Grand total: sum of the 'count' field over every category.
$total = 0;
foreach ($stats as $bucket) {
    $total += $bucket['count'];
}
printf("total traité: %d\n", $total);
/trunk/scripts/modules/cel/sphinx-maj.log
7,17 → 7,17
 
[not found] => Array
(
[count] => 6597
[count] => 5040
)
 
[too many] => Array
(
[count] => 1065
[count] => 4397
)
 
[fixable] => Array
(
[count] => 1448
[count] => 1064
)
 
[sauvages] => Array
27,13 → 27,8
 
[sphinx errors] => Array
(
[count] => 0
[count] => 152
)
 
[ref pb] => Array
(
[count] => 1543
)
 
)
total traité: 11243