Subversion Repositories Applications.papyrus

Rev

Rev 2141 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

<?php
/*vim: set expandtab tabstop=4 shiftwidth=4: */
// +------------------------------------------------------------------------------------------------------+
// | PHP version 4.1                                                                                      |
// +------------------------------------------------------------------------------------------------------+
// | Copyright (C) 2004 Tela Botanica (accueil@tela-botanica.org)                                         |
// +------------------------------------------------------------------------------------------------------+
// | This file is part of Papyrus.                                                                        |
// |                                                                                                      |
// | Papyrus is free software; you can redistribute it and/or modify                                      |
// | it under the terms of the GNU General Public License as published by                                 |
// | the Free Software Foundation; either version 2 of the License, or                                    |
// | (at your option) any later version.                                                                  |
// |                                                                                                      |
// | Papyrus is distributed in the hope that it will be useful,                                           |
// | but WITHOUT ANY WARRANTY; without even the implied warranty of                                       |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                                        |
// | GNU General Public License for more details.                                                         |
// |                                                                                                      |
// | You should have received a copy of the GNU General Public License                                    |
// | along with Papyrus; if not, write to the Free Software                                               |
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA                            |
// +------------------------------------------------------------------------------------------------------+
/**
 * Moteur de recherche SPHINX
 *
 * Installation de sphinx :
 * sudo urpmi lib64sphinxclient-devel
 * sudo pecl install sphinx
 *
 * See also:
 *  - http://www.ibm.com/developerworks/library/os-sphinx/
 *  - http://sphinxsearch.com/docs/manual-2.0.7.html#extended-syntax
 *
 * TODO: http://sphinxsearch.com/blog/2010/08/17/how-sphinx-relevance-ranking-works/
 * TODO: attention, projet,bazaar,spip et papyrus sont latin9, ainsi en est-il de l'input attendu du <form>
 * mais pour coste... c'est full utf-8
 * Structure retournée :
 *      'poids' => 0,
 *      'url' => '',
 *      'titre' => '',
 *      'hreflang' => '',
 *      'accesskey' => '',
 *      'title' => '', // balise 'title'
 *      'date_creation' => '',
 *      'description' => ''
 *
 *
 * // http://www.php.net/manual/fr/sphinx.examples.php
 * $s = new SphinxClient;
 * $s->setServer("localhost", 9306);
 * $s->setMatchMode(SPH_MATCH_ANY);
 * $s->setMaxQueryTime(3);
 * var_dump($s->query("test"));
 *
 * Note: conversion côté client SQL:
 * mysql -h0 -P 9306 < <(iconv -f utf8 -t latin1 <<<"SELECT * FROM i_projet, i_spip, i_papyrus, i_bazar WHERE MATCH('journée');")
 *
 *
 * Test :
 * ddiff
 * <(mysql -h0 -P 9306 <<<"SELECT main_id FROM i_projet, i_spip, i_papyrus, i_bazar, i_coste, i_nvjfl WHERE MATCH('test') LIMIT 50;"|awk '{print $3}'|sed 1d) \
 * <(POST http://localhost/site:reseau<<<"more_motif=test&"|awk -F'=>' '{print $2}'|sed -e 's/ //g' -e '/^$/d')
 *
 * both should be equal.
 * [ SELECT main_id, group_id FROM i_projet, i_spip, i_papyrus, i_bazar WHERE MATCH('test') LIMIT 50; ]
 *
 *
 *@package Applette
 *@subpackage Moteur_recherche
 //Auteur original :
 * @author              Raphaël Droz <raphael@tela-botanica.org>
 //Autres auteurs :
 *@author               Jean-Pascal MILCENT <jpm@tela-botanica.org>
 *@copyright    Tela-Botanica 2000-2013
 *@version              $Revision$
 // +------------------------------------------------------------------------------------------------------+
 */

// Address (host:port) handed to mysql_connect() in sphinx_search() to reach the Sphinx daemon.
define('SPHINX_DSN', '193.54.123.216:9306');
// Host name used to build absolute result URLs (see the COSTE and NVJFL backends).
// HTTP_HOST is absent outside of a web request (CLI, cron): fall back to an empty string
// instead of triggering an "undefined index" notice.
define('_MRS_SPHINX_BASEHOST', isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '');
// Timezone and locale used by strftime() when formatting result dates
date_default_timezone_set('Europe/Paris');
setlocale(LC_TIME, 'fr_FR');

/**
 * Runs a full-text query against the Sphinx daemon (SphinxQL spoken over the MySQL protocol)
 * and returns the matching documents, enriched by the backend each hit belongs to.
 *
 * Steps:
 *  1. query all Sphinx indexes (at most 50 hits) and bucket the hits by their 'group_id';
 *  2. for every known group, ask the matching MoteurRecherche_* class for the document details;
 *  3. order the documents following the order of the 'main_id' values returned by Sphinx
 *     (or by creation date, newest first, when $_GET['tri'] == 'date');
 *  4. replace the raw 'poids' (weight) of each document by a 'score' percentage.
 *
 * NOTE(review): the ordering step assumes 'main_id' has the form "<group>-<id>", i.e. the same
 * shape as the keys built here for $docs -- confirm against the Sphinx index configuration.
 *
 * @param string|null $q    the search pattern (Sphinx extended query syntax is allowed).
 * @param int         $page currently unused, kept for interface compatibility.
 * @return array the list of documents found; empty when $q is empty, when the Sphinx daemon
 *               cannot be reached or when the query fails.
 */
function sphinx_search($q = NULL, $page = 1) {
        if(!$q) return array();

        // new_link = TRUE: dedicated link, so that it can safely be closed below without
        // touching any other MySQL connection opened by the application.
        $db = mysql_connect(SPHINX_DSN, NULL, NULL, TRUE);
        if (!$db) {
                return array();
        }

        // The pattern is escaped as an SQL string literal only: quotes or backslashes typed by
        // the user can no longer break out of MATCH('...'), while the Sphinx query operators
        // (which are not SQL meta-characters) are left untouched.
        $requeteTpl = 'SELECT group_id, main_id, id, WEIGHT() AS poids '.
                'FROM i_projet, i_spip, i_papyrus, i_bazar, i_coste, i_nvjfl '.
                "WHERE MATCH('%s') ".
                'LIMIT 50 ';
        $requete = mysql_query(sprintf($requeteTpl, mysql_real_escape_string($q, $db)), $db);
        if (!$requete) {
                mysql_close($db);
                return array();
        }

        // group_id => class in charge of fetching the documents of that group.
        // The order matters: it is the order of the documents left unsorted by _sortArrayByArray().
        $moteurs = array(
                'spip' => 'MoteurRecherche_SPIP',
                'bazar' => 'MoteurRecherche_BAZAR',
                'projet' => 'MoteurRecherche_PROJET',
                'papyrus' => 'MoteurRecherche_PAPYRUS',
                'coste' => 'MoteurRecherche_COSTE',
                'nvjfl' => 'MoteurRecherche_NVJFL');

        $res = array();
        foreach ($moteurs as $groupe => $classe) {
                $res[$groupe] = array();
        }

        $ids_par_poids = array();
        $poidsMax = 0;
        while ($rec = mysql_fetch_array($requete, MYSQL_ASSOC)) {
                if ($rec['poids'] > $poidsMax) {
                        $poidsMax = $rec['poids'];
                }
                $res[$rec['group_id']][$rec['id']] = $rec;
                $ids_par_poids[] = $rec['main_id'];
        }
        // Everything needed has been read: release the Sphinx resources right away.
        mysql_free_result($requete);
        mysql_close($db);

        $docs = array();
        foreach ($moteurs as $groupe => $classe) {
                // drop null ids and make sure only integers reach the backend SQL
                $ids = array_map('intval', array_filter(array_keys($res[$groupe]), 'intval'));
                $moteur = new $classe();
                foreach ($moteur->get($ids, $q) as $v) {
                        // only the Sphinx-specific data (the weight) is merged into the document
                        $infos = isset($res[$groupe][$v['id']]) ? $res[$groupe][$v['id']] : array();
                        unset($infos['group_id'], $infos['main_id'], $infos['id']);
                        $docs[$groupe . '-' . $v['id']] = array_merge($v, $infos);
                }
        }

        // sort
        $sorted = _sortArrayByArray($docs, $ids_par_poids);

        if (isset($_GET['tri']) && $_GET['tri'] == 'date') {
                usort($sorted, '_actuNewerFirst');
        }
        // rename the keys to fit the existing templates; the divisor is never 0
        array_walk($sorted, '_weight2score', ($poidsMax > 0) ? $poidsMax : 1);

        return $sorted;
}

/**
 * Fetches the published SPIP articles matching a list of ids.
 */
class MoteurRecherche_SPIP {
        /**
         * Reads from 'spip_articles' the articles whose status is 'publie' and whose id is listed.
         *
         * Stops the script (die) with the DB error message when the connection or the query fails.
         *
         * @param array       $ids the article ids to fetch.
         * @param string|null $q   the search pattern, forwarded in the 'var_recherche' parameter of the url.
         * @return array rows indexed by article id; each row holds the keys 'id', 'titre',
         *               'date_creation', 'hreflang', 'url_simple', 'url' and 'description'.
         */
        public function get($ids, $q = NULL) {
                $content = array();
                if (count($ids) > 0) {
                        // the ids are interpolated in the SQL: make sure they are integers
                        $ids = array_map('intval', $ids);

                        $db = DB::connect($GLOBALS['_MOTEUR_RECHERCHE_']['spip'][0]['bdd_dsn']);
                        // DB::connect() returns an error object on failure: calling query() on it would be fatal
                        if (DB::isError($db)) {
                                die($db->getMessage());
                        }

                        $requeteTpl = 'SELECT id_article AS id, titre, texte, date AS date_creation, lang as hreflang '.
                                        'FROM spip_articles '.
                                        'WHERE statut = "%s" '.
                                        'AND id_article IN (%s) ';
                        $requete = $db->query(sprintf($requeteTpl, 'publie', implode(',', $ids)));
                        if (DB::isError($requete)) {
                                die($requete->getMessage());
                        }

                        while ($rec = $requete->fetchRow(DB_FETCHMODE_ASSOC)) {
                                // 'url_simple' is the bare article url, 'url' also carries the search pattern
                                $rec['url_simple'] = sprintf("%s/article%d.html",
                                         trim($GLOBALS['_MOTEUR_RECHERCHE_']['spip'][0]['url'], '/'),
                                         $rec['id']);
                                $rec['url'] = sprintf("%s?var_recherche=%s",
                                                $rec['url_simple'],
                                                More_Recherche::traiterMotif($q, 'url'));
                                $rec['description'] = More_Recherche::couperTexte($rec['texte'], MORE_RESULTAT_TAILLE_DESCRIPTION);
                                unset($rec['texte']);
                                $content[$rec['id']] = $rec;
                        }
                }
                return $content;
        }
}

/**
 * Fetches the Bazar records ("fiches") matching a list of ids.
 */
class MoteurRecherche_BAZAR {
        /**
         * Reads the listed records from 'bazar_fiche'.
         *
         * Stops the script (die) with the DB error message when the connection or the query fails.
         *
         * @param array       $ids the record ids to fetch.
         * @param string|null $q   the search pattern (not used by this backend).
         * @return array rows indexed by record id; each row holds the keys 'id', 'titre',
         *               'date_creation', 'url_simple', 'url' and 'description'.
         */
        public function get($ids, $q = NULL) {
                $content = array();
                if (count($ids) > 0) {
                        // the ids are interpolated in the SQL: make sure they are integers
                        $ids = array_map('intval', $ids);

                        $db = DB::connect($GLOBALS['_MOTEUR_RECHERCHE_']['bazar'][0]['bdd_dsn']);
                        // DB::connect() returns an error object on failure: calling query() on it would be fatal
                        if (DB::isError($db)) {
                                die($db->getMessage());
                        }

                        $requeteTpl = 'SELECT bf_id_fiche AS id, '.
                                        'bf_description AS texte, '.
                                        'bf_titre AS titre, '.
                                        'bf_date_debut_evenement AS date_creation '.
                                        'FROM bazar_fiche '.
                                        'WHERE bf_id_fiche IN (%s) ';
                        $requete = $db->query(sprintf($requeteTpl, implode(',', $ids)));
                        if (DB::isError($requete)) {
                                die($requete->getMessage());
                        }

                        while ($rec = $requete->fetchRow(DB_FETCHMODE_ASSOC)) {
                                // the configured url is used as a sprintf() template receiving the record id
                                $url = sprintf(trim($GLOBALS['_MOTEUR_RECHERCHE_']['bazar'][0]['url'], '/'), $rec['id']);
                                $rec['url'] = $url;
                                $rec['url_simple'] = $url;
                                $rec['description'] = More_Recherche::couperTexte($rec['texte'], MORE_RESULTAT_TAILLE_DESCRIPTION);
                                unset($rec['texte']);
                                $content[$rec['id']] = $rec;
                        }
                }
                return $content;
        }
}

/**
 * Fetches the projects matching a list of ids.
 */
class MoteurRecherche_PROJET {
        /**
         * Reads the listed projects from the 'projet' table, through the already opened
         * connection stored in $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['papyrus'].
         *
         * Stops the script (die) with the DB error message when the query fails.
         *
         * @param array       $ids the project ids to fetch.
         * @param string|null $q   the search pattern (not used by this backend).
         * @return array rows indexed by project id; each row holds the keys 'id', 'titre',
         *               'date_creation', 'url_simple', 'url' and 'description'.
         */
        public function get($ids, $q = NULL) {
                $content = array();
                if (count($ids) > 0) {
                        // the ids are interpolated in the SQL: make sure they are integers
                        $ids = array_map('intval', $ids);

                        $db = $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['papyrus'];
                        $requeteTpl = 'SELECT p_id AS id, p_titre AS titre, p_description, p_date_creation AS date_creation '.
                                        'FROM projet '.
                                        'WHERE p_id IN (%s)';
                        $requete = $db->query(sprintf($requeteTpl, implode(',', $ids)));
                        if (DB::isError($requete)) {
                                die($requete->getMessage());
                        }

                        // NOTE(review): the other backends read their url from [...][0]['url'], this
                        // one from ['projet']['url'] -- confirm against the configuration.
                        $urlBase = trim($GLOBALS['_MOTEUR_RECHERCHE_']['projet']['url'], '/');
                        $tailleMax = 400 + 2 * MORE_RESULTAT_TAILLE_DESCRIPTION;

                        while ($rec = $requete->fetchRow(DB_FETCHMODE_ASSOC)) {
                                $url = sprintf("%s?id_projet=%d", $urlBase, $rec['id']);
                                $rec['url'] = $url;
                                $rec['url_simple'] = $url;
                                // plain text excerpt of the project description
                                $rec['description'] = substr(strip_tags($rec['p_description']), 0, $tailleMax);
                                unset($rec['p_description']);
                                $content[$rec['id']] = $rec;
                        }
                }
                return $content;
        }
}

/**
 * Fetches the Papyrus menus (pages) matching a list of ids.
 */
class MoteurRecherche_PAPYRUS {
        /**
         * Reads the listed menus from 'gen_menu', joined with their latest content
         * ('gen_menu_contenu' rows flagged gmc_bool_dernier = 1), through the already opened
         * connection stored in $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['papyrus'].
         *
         * Stops the script (die) with the DB error message when the query fails.
         *
         * @param array       $ids the menu ids to fetch.
         * @param string|null $q   the search pattern, forwarded in the 'var_recherche' parameter of the url.
         * @return array rows indexed by menu id; besides the selected columns, each row holds
         *               the keys 'url_simple', 'url' and 'description' (HTML-escaped summary).
         */
        public function get($ids, $q = NULL) {
                $content = array();
                if (count($ids) > 0) {
                        // the ids are interpolated in the SQL: make sure they are integers
                        $ids = array_map('intval', $ids);

                        $db = $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['papyrus'];
                        $requeteTpl = 'SELECT mc.gmc_ce_menu AS id, '.
                                "       IF(gm_nom != '', gm_nom, IF(gm_titre != '', gm_titre, gm_titre_alternatif)) AS titre, ".
                                '       gmc_contenu AS texte, '.
                                '       gm_description_libre, gm_description_resume, '.
                                '       gm_mots_cles,gm_source, gm_auteur, gm_contributeur, gm_editeur, gm_categorie, '.
                                '       gm_date_creation AS date_creation '.
                                'FROM gen_menu AS m '.
                                '       LEFT JOIN gen_menu_contenu AS mc ON mc.gmc_ce_menu = m.gm_id_menu AND mc.gmc_bool_dernier = 1 '.
                                'WHERE mc.gmc_ce_menu IN (%s) ';
                        $requete = $db->query(sprintf($requeteTpl, implode(',', $ids)));
                        if (DB::isError($requete)) {
                                die($requete->getMessage());
                        }

                        while ($rec = $requete->fetchRow(DB_FETCHMODE_ASSOC)) {
                                // Build the urls: the bare one first, then the one carrying the search pattern
                                // TODO: as for spip, use a dedicated configuration file to get rid of PAP_URL here
                                $une_url = new Pap_URL(PAP_URL);
                                $une_url->setId($rec['id']);
                                $rec['url_simple'] = $une_url->getURL();
                                $une_url->addQueryString('var_recherche', More_Recherche::traiterMotif($q, 'url'), true);
                                $rec['url'] = $une_url->getURL();

                                $rec['description'] =  htmlentities($rec['gm_description_resume']);
                                unset($rec['gm_description_resume']);
                                $content[$rec['id']] = $rec;
                        }
                }
                return $content;
        }
}

/**
 * Fetches the entries of the Coste flora matching a list of ids.
 */
class MoteurRecherche_COSTE {
        /**
         * Reads the listed entries from 'tb_eflore.coste_v2_00', joined with the latest version
         * of their wiki description page, through the already opened connection stored in
         * $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['bota'].
         *
         * Stops the script (die) with the DB error message when the query fails.
         *
         * @param array       $ids the ids (column flore_bdtfx_nn) to fetch.
         * @param string|null $q   the search pattern (not used by this backend).
         * @return array rows indexed by id; each row holds the keys 'id', 'titre',
         *               'description', 'url_simple' and 'url'.
         */
        public function get($ids, $q = NULL) {
                $content = array();
                if (count($ids) > 0) {
                        // the ids are interpolated in the SQL: make sure they are integers
                        $ids = array_map('intval', $ids);

                        // No dedicated connection for these databases: the query relies on the
                        // 'bota' connection being allowed to read them.
                        $db = $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['bota'];
                        $requeteTpl = 'SELECT c.flore_bdtfx_nn AS id, c.nom_sci AS titre, dsc.body AS description '.
                                'FROM tb_eflore.coste_v2_00 AS c '.
                                "       LEFT JOIN tela_prod_wikini.florecoste_pages dsc ON c.page_wiki_dsc = dsc.tag AND dsc.latest = 'Y' ".
                                'WHERE c.flore_bdtfx_nn IN (%s) ';
                        $requete = $db->query(sprintf($requeteTpl, implode(',', $ids)));
                        if (DB::isError($requete)) {
                                die($requete->getMessage());
                        }

                        $tailleMax = 400 + 2 * MORE_RESULTAT_TAILLE_DESCRIPTION;

                        while ($rec = $requete->fetchRow(DB_FETCHMODE_ASSOC)) {
                                $url = sprintf("http://%s/bdtfx-nn-%d", _MRS_SPHINX_BASEHOST, $rec['id']);
                                $rec['url'] = $url;
                                $rec['url_simple'] = $url;
                                // TODO: interpret the wikini markup
                                // NOTE(review): substr() cuts on bytes; the file header says the coste data
                                // is UTF-8, so a multi-byte character may be truncated -- confirm.
                                $rec['description'] = substr($rec['description'], 0, $tailleMax);
                                $content[$rec['id']] = $rec;
                        }
                }
                return $content;
        }
}

/**
 * Fetches the scientific names matching a list of ids, together with their vernacular names.
 */
class MoteurRecherche_NVJFL {
        /**
         * Reads the listed names from 'tb_eflore.bdtfx_v1_01' and concatenates, for each of them,
         * the vernacular names found in 'tb_eflore.nvjfl_v2007' for the same taxon. The query goes
         * through the already opened connection stored in $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['bota'].
         *
         * Stops the script (die) with the DB error message when the query fails.
         *
         * @param array       $ids the ids (column num_nom) to fetch.
         * @param string|null $q   the search pattern (not used by this backend).
         * @return array rows indexed by id; each row holds the keys 'id', 'titre',
         *               'description', 'url_simple' and 'url'.
         */
        public function get($ids, $q = NULL) {
                $content = array();
                if (count($ids) > 0) {
                        // the ids are interpolated in the SQL: make sure they are integers
                        $ids = array_map('intval', $ids);

                        // No dedicated connection for this database: the query relies on the
                        // 'bota' connection being allowed to read it.
                        $db = $GLOBALS['_MOTEUR_RECHERCHE_']['bd']['bota'];
                        // One row per requested name is expected, hence the grouping on b.num_nom.
                        // Grouping on the LEFT JOINed n.num_taxon merged into a single row every name
                        // without vernacular name (NULL group) and every name sharing the same taxon.
                        $requeteTpl = 'SELECT b.num_nom AS id, '.
                        "       CONCAT(nom_sci, ' (nn: ', b.num_nom, ', nt: ', num_taxonomique, ')') AS titre, ".
                        '       GROUP_CONCAT(n.nom_vernaculaire) AS description '.
                        'FROM tb_eflore.bdtfx_v1_01 AS b '.
                        '       LEFT JOIN tb_eflore.nvjfl_v2007 n ON n.num_taxon = b.num_taxonomique '.
                        'WHERE b.num_nom IN (%s) '.
                        'GROUP BY b.num_nom ';
                        $requete = $db->query(sprintf($requeteTpl, implode(',', $ids)));
                        if (DB::isError($requete)) {
                                die($requete->getMessage());
                        }

                        $tailleMax = 400 + 2 * MORE_RESULTAT_TAILLE_DESCRIPTION;

                        while ($rec = $requete->fetchRow(DB_FETCHMODE_ASSOC)) {
                                $url = sprintf("http://%s/bdtfx-nn-%d", _MRS_SPHINX_BASEHOST, $rec['id']);
                                $rec['url'] = $url;
                                $rec['url_simple'] = $url;
                                $rec['description'] = substr($rec['description'], 0, $tailleMax);
                                $content[$rec['id']] = $rec;
                        }
                }
                return $content;
        }
}

// http://stackoverflow.com/questions/348410/sort-an-array-based-on-another-array
/**
 * Reorders an associative array following a list of keys.
 *
 * The entries whose key is listed in $orderArray come first, in the order of that list;
 * the remaining entries follow, in their original order. Listed keys that do not exist
 * in $array are ignored.
 *
 * @param array $array      the associative array to reorder.
 * @param array $orderArray the keys, in the wanted order.
 * @return array the reordered array (keys are preserved).
 */
function _sortArrayByArray($array, $orderArray) {
        $enTete = array();
        foreach ($orderArray as $clef) {
                if (!array_key_exists($clef, $array)) {
                        continue;
                }
                // move the entry from the input to the head of the result
                $enTete[$clef] = $array[$clef];
                unset($array[$clef]);
        }
        // whatever was not listed is appended untouched
        $resultat = $enTete + $array;
        return $resultat;
}

/**
 * Comparison callback (for usort) ordering documents by 'date_creation', newest first.
 *
 * The dates are compared as strings. Two documents are considered equal as soon as
 * one of them has no 'date_creation'.
 *
 * @param array $a first document.
 * @param array $b second document.
 * @return int a negative, null or positive value, as expected by usort().
 */
function _actuNewerFirst($a,$b) {
        if (!isset($a['date_creation']) || !isset($b['date_creation'])) {
                return 0;
        }
        // $b comes first in the comparison to get a descending order
        return strcmp($b['date_creation'], $a['date_creation']);
}

// Transforme un score en pourcentage
/**
 * array_walk() callback adapting a document to the keys expected by the templates:
 *  - 'poids' (raw weight) is replaced by 'score', a percentage of the maximum weight;
 *  - 'date_creation' is reformatted with strftime("%d %B %Y") (empty string when missing).
 *
 * NOTE(review): strftime() is deprecated since PHP 8.1; it is kept because the month name
 * depends on the locale set with setlocale().
 *
 * @param array  $item the document, modified in place.
 * @param mixed  $key  the key of the document (unused, imposed by array_walk).
 * @param number $max  the maximum weight, matching a score of 100.
 * @return void
 */
function _weight2score(&$item, $key, $max) {
        $poids = isset($item['poids']) ? $item['poids'] : 0;
        // a null (or non numeric) maximum would lead to a division by zero
        $item['score'] = (is_numeric($max) && $max != 0) ? intval($poids / $max * 100) : 0;
        $item['date_creation'] = isset($item['date_creation']) ? strftime("%d %B %Y", strtotime($item['date_creation'])) : '';
        unset($item['poids']);
}

?>