4 |
david |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
//
|
|
|
4 |
// Lit un document 'pdf' et extrait son contenu en texte brut
|
|
|
5 |
//
|
|
|
6 |
|
|
|
7 |
// NOTE : l'extracteur n'est pas oblige de convertir le contenu dans
|
|
|
8 |
// le charset du site, mais il *doit* signaler le charset dans lequel
|
|
|
9 |
// il envoie le contenu, de facon a ce qu'il soit converti au moment
|
|
|
10 |
// voulu ; dans le cas contraire le document sera lu comme s'il etait
|
|
|
11 |
// dans le charset iso-8859-1
|
|
|
12 |
|
|
|
13 |
/**
 * Extract the plain text of a PDF file by shelling out to an external converter.
 *
 * Two command-line tools are tried in order: `metamail`, then `pdftotext`
 * (Xpdf). The output lines of the first command that exits with status 0 are
 * joined with single spaces and returned. A pure-PHP alternative exists in
 * the Format_PDF class of this file but is not used here.
 *
 * @param string $fichier  Path of the PDF file; it is shell-escaped before use.
 * @param string $charset  Output parameter, always set to 'iso-8859-1'.
 *                         NOTE(review): this assumes the tools emit Latin-1;
 *                         the original comment says pdftotext's "-enc utf-8"
 *                         option may fail, which is why it is not requested.
 * @return string|null     Extracted text, or null when every command failed.
 */
function extracteur_pdf($fichier, &$charset) {
	$charset = 'iso-8859-1';

	$commandes = array(
		// metamail
		'metamail -d -q -b -c application/pdf '.escapeshellarg($fichier),
		// pdftotext -- http://www.glyphandcog.com/Xpdf.html
		'pdftotext '.escapeshellarg($fichier).' -'
	);

	foreach ($commandes as $commande) {
		// exec() APPENDS to an existing array: start from an empty buffer so
		// that lines printed by a previous, failed command never leak into
		// the text returned for this one.
		$r = array();
		$e = 1;
		exec($commande, $r, $e);
		if (!$e) return join(' ', $r);
	}

	// No converter succeeded.
	return null;
}
|
|
|
34 |
|
|
|
35 |
// Sait-on extraire ce format ?
|
|
|
36 |
// TODO: ici tester si les binaires fonctionnent
|
|
|
37 |
// Register the function name 'extracteur_pdf' under the 'pdf' key of the
// global 'extracteur' table. Presumably the caller looks extractors up in this
// table by document type -- confirm against the code that reads it.
// The registration is unconditional: nothing here checks that the external
// binaries used by extracteur_pdf() are installed or working.
$GLOBALS['extracteur']['pdf'] = 'extracteur_pdf';
|
|
|
38 |
|
|
|
39 |
|
|
|
40 |
|
|
|
41 |
|
|
|
42 |
|
|
|
43 |
|
|
|
44 |
//
|
|
|
45 |
// Methode tout PHP (a tester)
|
|
|
46 |
//
|
|
|
47 |
|
|
|
48 |
/**
 * Pure-PHP, heuristic PDF text extractor (experimental).
 *
 * The file is cut into objects, each object's stream is located and (for
 * FlateDecode) inflated, streams that do not look like page text are dropped,
 * and the parenthesised string literals of the remaining streams are glued
 * back into words.
 */
class Format_PDF {
	// Byte => replacement table used by convertir_caracteres(); built lazily.
	var $trans_chars;
	// State of recoller_texte(): $flag_mono is kept from one call to the next,
	// $flag_brut is reset at the start of every call.
	var $flag_mono, $flag_brut;

	/**
	 * Replace special bytes and PDF octal escapes in a raw stream.
	 *
	 * @param string $texte  Raw stream content.
	 * @return string        Content with ligature bytes expanded and every
	 *                       backslash + three octal digits replaced by its byte.
	 */
	function convertir_caracteres($texte) {
		if (!$this->trans_chars) {
			// Special characters
			$this->trans_chars = array(
				// typographic ligatures (!)
				chr(2) => 'fi',
				chr(3) => 'fl',
				chr(174) => 'fi',
				chr(175) => 'fl',
				// "e" with acute accent
				chr(0) => chr(233)
			);
		}
		$texte = strtr((string) $texte, $this->trans_chars);

		// Non-ASCII characters written in octal. A single pass is used so that
		// a decoded backslash (\134) is never combined with the digits that
		// follow it to form a second, bogus escape.
		$decode = preg_replace_callback(
			',\\\\([0-7][0-7][0-7]),',
			array($this, 'decoder_octal'),
			$texte
		);
		return ($decode === null) ? $texte : $decode;
	}

	/**
	 * preg_replace_callback() helper: turn one octal escape into its byte.
	 *
	 * @param array $regs  Match array; $regs[1] holds the three octal digits.
	 * @return string      One byte (values above 255 wrap, as chr() used to do).
	 */
	function decoder_octal($regs) {
		return chr(octdec($regs[1]) & 0xFF);
	}

	/**
	 * Glue the string literals of a text stream back into readable text.
	 *
	 * The stream is split on every ")...(" gap, leaving the content of each
	 * literal. The loop switches between three modes:
	 *  - normal: several characters per literal, word-joining heuristics;
	 *  - raw ("brut"): several characters per literal, plain concatenation;
	 *  - mono: one character per literal, plain concatenation.
	 * In normal mode the first literal is usually preceded by one space.
	 *
	 * @param string $stream  Stream content, already cut to start after the
	 *                        first "(" and end before the last ")".
	 * @return string         Reassembled text.
	 */
	function recoller_texte($stream) {
		static $chars_voyelles, $chars_fusion, $chars_caps, $chars_nums, $bichars_fusion;
		if (!$chars_voyelles) {
			$chars_voyelles = array('a'=>1, 'e'=>1, 'i'=>1, 'o'=>1, 'u'=>1, 'y'=>1);
			$chars_fusion = array('v'=>1, 'w'=>1, 'x'=>1, 'V'=>1, 'W'=>1, 'T'=>1);
			$chars_caps = array('A'=>1, 'B'=>1, 'C'=>1, 'D'=>1, 'E'=>1, 'F'=>1, 'G'=>1,
				'H'=>1, 'I'=>1, 'J'=>1, 'K'=>1, 'L'=>1, 'M'=>1, 'N'=>1,
				'O'=>1, 'P'=>1, 'Q'=>1, 'R'=>1, 'S'=>1, 'T'=>1, 'U'=>1,
				'V'=>1, 'W'=>1, 'X'=>1, 'Y'=>1, 'Z'=>1);
			$chars_nums = array('0'=>1, '1'=>1, '2'=>1, '3'=>1, '4'=>1, '5'=>1, '6'=>1, '7'=>1, '8'=>1, '9'=>1);
			$bichars_fusion = array('ve'=>1, 'vo'=>1, 'ev'=>1, 'ov'=>1,
				'xe'=>1, 'xo'=>1, 'ox'=>1, 'ex'=>1,
				'we'=>1, 'wo'=>1, 'ow'=>1, 'ew'=>1, 'ff'=>1);
		}
		// Maximum literal length, to limit extraction errors
		$chaine_len = 140;

		$morceaux = preg_split(',\)[^(]*\(,', (string) $stream);
		if (!is_array($morceaux)) $morceaux = array();
		$nb = count($morceaux);
		// Read cursor shared by the three modes: each literal is consumed once.
		$i = 0;
		$extrait = '';
		$this->flag_brut = false;

		while (true) {
			if ($this->flag_mono) {
				// One character per literal: fast concatenation
				while ($i < $nb) {
					$s = $morceaux[$i++];
					if (strlen($s) != 1) {
						if (strlen($s) < $chaine_len) $extrait .= $s;
						$this->flag_mono = false;
						break;
					}
					$extrait .= $s;
				}
				// Still in mono mode: the input is exhausted
				if ($this->flag_mono) break;
			}
			else if ($this->flag_brut) {
				// Concatenation without any joining rule
				while ($i < $nb) $extrait .= $morceaux[$i++];
				break;
			}

			// General case: apply the joining rules. The previous literal is
			// held back one step so that a trailing hyphen can be dropped.
			$prev_s = '';
			$prev_c = '';
			$prev_l = 0;
			$nb_mono = 0;
			while ($i < $nb) {
				$s = $morceaux[$i++];
				$l = strlen($s);
				if ($l >= $chaine_len) continue;
				$c = ($l > 0) ? $s[0] : '';
				if ($prev_c === '-') {
					// Undo the hyphenation
					$extrait .= substr($prev_s, 0, -1);
				}
				else {
					$extrait .= $prev_s;
					$len_w = strpos($s.' ', ' ');
					// NOTE(review): when $prev_s contains a space this is the
					// last word's length plus one; kept as in the original
					// because the thresholds below were tuned against it.
					$dernier_espace = strrpos($prev_s, ' ');
					$prev_len_w = $prev_l - (($dernier_espace === false) ? 0 : $dernier_espace);
					$court = ($prev_len_w < 3 || $len_w < 3);
					// Heuristic deciding whether the two literals belong to
					// the same word (no space) or not (one space)
					$coller = ($court
							&& (isset($chars_fusion[$prev_c]) || isset($chars_fusion[$c])
								|| (isset($chars_caps[$prev_c])
									&& (isset($chars_caps[$c]) || isset($chars_nums[$c])))))
						|| ($prev_c === 'f' && isset($chars_voyelles[$c]))
						|| isset($bichars_fusion[$prev_c.$c]);
					if (!$coller) $extrait .= ' ';
				}
				$prev_c = ($l > 0) ? $s[$l - 1] : '';
				$prev_s = $s;
				$prev_l = $l;
				if ($l == 1) {
					// Three single-character literals in a row: mono format
					if (++$nb_mono >= 3) {
						$this->flag_mono = true;
						break;
					}
				}
				else {
					$nb_mono = 0;
					// A literal carrying its own leading or trailing space:
					// the spacing is explicit, switch to raw mode
					if ($c === ' ' || $prev_c === ' ') {
						$this->flag_brut = true;
						break;
					}
				}
			}
			// Flush the literal that was being held back
			$extrait .= $prev_s;
			if (!$this->flag_mono && !$this->flag_brut) break;
		}
		return $extrait;
	}

	/**
	 * Extract text from a PDF file.
	 *
	 * Only the first megabyte of the file is read, at most 20 KB of each
	 * stream is analysed, and the result is cut at roughly 40 KB.
	 *
	 * @param string $fichier  Path of the PDF file.
	 * @return string          Extracted text; '' when the file cannot be
	 *                         opened (a warning is raised) or holds no text.
	 */
	function extraire_texte($fichier) {

		$source_len = 1024*1024;
		$stream_len = 20*1024;
		$texte_len = 40*1024;

		$texte = '';

		$f = fopen($fichier, "rb");
		if (!$f) {
			// Report the problem without terminating the calling script.
			trigger_error("Fichier $fichier impossible a ouvrir", E_USER_WARNING);
			return $texte;
		}
		$source = fread($f, $source_len);
		fclose($f);
		if (!is_string($source)) return $texte;

		// Cut the file into objects
		$objs = preg_split('/[\s>]endobj\s+/', $source);
		unset($source);
		if (!is_array($objs)) return $texte;

		$in_stream = false;
		$stream = '';
		$obj_dict = '';

		// Walk the objects looking for streams
		$n = count($objs);
		for ($i = 0; $i < $n; $i++) {
			$obj = $objs[$i];

			if (!$in_stream) {
				// "stream" keyword (avoiding comments): cheap test first
				if (!preg_match("/stream(\r\n?|\n)/", $obj)) continue;
				if (!preg_match("/[\r\n](([^\r\n%]*[ \t>])*stream(\r\n?|\n))/", $obj, $regs, PREG_OFFSET_CAPTURE)) continue;
				// Use the offset of the match itself: searching for the matched
				// text again could find an earlier copy, e.g. inside a comment.
				$fin_entete = $regs[1][1] + strlen($regs[1][0]);
				$t = (string) substr($obj, $fin_entete);
				$stream = '';
				$in_stream = true;

				$obj_text = substr($obj, 0, $fin_entete);

				// Comments
				$obj_text = preg_replace("/\\\\%/", ' ', $obj_text);
				$obj_text = preg_replace("/%[^\r\n]*[\r\n]+/", '', $obj_text);

				// Dictionary
				$obj_dict = '';
				if (preg_match("/<<(.*)>>/s", (string) $obj_text, $regs))
					$obj_dict = $regs[1];
			}
			else {
				// Put back (approximately) the delimiter removed by the split.
				// NOTE(review): because of this prefix "endobj" is always found
				// below, so a continued stream ends at this object at the latest.
				$t = " endobj ".$obj;
			}
			unset($obj);

			// Reassemble the pieces of the stream (in case an "endobj" appeared
			// in clear inside the stream data). Offset 0 is a valid hit: an
			// empty stream has "endstream" at the very start.
			$p = strpos($t, "endstream");
			$q = strpos($t, "endobj");
			if ($p === false && $q === false) {
				$stream .= $t;
				continue;
			}
			$in_stream = false;
			$stream .= substr($t, 0, ($p !== false) ? $p : $q);
			unset($t);

			// Decode the stream content
			$encoding = '';
			if (preg_match(",/Filter\s*/([A-Za-z]+),", $obj_dict, $regs))
				$encoding = $regs[1];
			switch($encoding) {
				case 'FlateDecode':
					// Fails on some PDFs; a failure yields false and the
					// stream is then skipped below.
					if ($stream !== '' && function_exists('gzuncompress'))
						$stream = gzuncompress($stream);
					else
						$stream = '';
					break;
				case '':
					break;
				default:
					// Unsupported filter
					$stream = '';
			}

			if (!$stream) continue;

			// Skip embedded fonts and the like
			if (preg_match(',^%!,', $stream)) continue;

			// Text / binary detection
			$stream = substr($stream, 0, $stream_len);
			$stream = str_replace('\\(', ",", $stream);
			$stream = str_replace('\\)', ",", $stream);
			$n1 = substr_count($stream, '(');
			$n2 = substr_count($stream, ')');
			if (!$n1 && !$n2) continue;
			$freq = (substr_count($stream, ' ') + $n1 + $n2) / strlen($stream);
			if ($freq < 0.04) continue;
			// Opening and closing parentheses must be roughly balanced
			$dev = abs($n1 - $n2) / ($n1 + $n2);
			if ($dev > 0.05) continue;

			// Remove inline dictionaries (the pattern is written in two parts
			// so that the source never contains a literal "?" + ">" pair)
			if (strpos($stream, '<<') !== false && strpos($stream, '>>') !== false) {
				$sans_dict = preg_replace(',<<.*?'.'>>,s', '', $stream);
				if (is_string($sans_dict)) $stream = $sans_dict;
			}

			// Keep what lies between the first "(" and the last ")"
			$debut = strpos($stream, '(');
			$stream = ($debut === false) ? '' : (string) substr($stream, $debut + 1);
			$fin = strrpos($stream, ')');
			$stream = ($fin === false) ? '' : (string) substr($stream, 0, $fin);

			$stream = $this->convertir_caracteres($stream);
			$texte .= $this->recoller_texte($stream);
			$stream = '';

			// Limit the size of the output, cutting on a space when there is one
			if (strlen($texte) > $texte_len) {
				$coupe = strrpos(substr($texte, 0, $texte_len), ' ');
				if ($coupe === false) $coupe = $texte_len;
				$texte = substr($texte, 0, $coupe);
				break;
			}
		}

		return $texte;
	}

} // class
|
|
|
312 |
|
|
|
313 |
|
|
|
314 |
?>
|