468 |
mathias |
1 |
<?php
|
|
|
2 |
/**
 * Lightweight HTML/XML parser that turns markup into a nested array tree.
 *
 * Element nodes are arrays with the keys 'name' (markup name), 'args'
 * (attribute name => entity-decoded value) and, for markups that are not
 * self-closing, 'children' (a list of nodes). Text is stored as plain strings.
 * Comments and processing instructions are not parsed; they end up inside
 * text nodes.
 *
 * @author Olivier Laviale
 * @see http://www.weirdog.com/blog/php/un-parser-html-des-plus-leger.html
 */
class WdHTMLParser {
    /** Character set handed to html_entity_decode() for attribute values. */
    private $encoding = 'utf-8';

    /** Flat list produced by preg_split(), stored reversed so it can be consumed with array_pop(). */
    private $matches = array();

    /** True when the current document contained a comment or processing instruction. */
    private $escaped = false;

    /** Stack of the names of the markups that are currently open. */
    private $opened = array();

    /** Set to true by error() when a closing markup does not match the last opened one. */
    public $malformed;

    /**
     * Parses a document and returns its tree.
     *
     * @param string      $html      The markup to parse.
     * @param string|null $namespace Literal prefix (e.g. 'wdp:') a markup name must start with
     *                               to be parsed; anything else is left as text.
     * @param string      $encoding  Character set used to decode entities in attribute values.
     *
     * @return array List of nodes (strings for text, arrays for markups).
     */
    public function parse($html, $namespace=NULL, $encoding='utf-8') {
        // reset every piece of per-document state so that a previous call
        // (possibly on a malformed document) cannot influence this one
        $this->malformed = false;
        $this->encoding = $encoding;
        $this->escaped = false;
        $this->opened = array();

        // we take care of escaping comments and processing options. they will not be parsed
        // and will end as text nodes
        $html = $this->escapeSpecials($html);

        // in order to create a tree, we first need to split the HTML using the markups,
        // creating a nice flat array of texts and opening and closing markups.
        //
        // the array can be read as follows:
        //
        // i+0 => some text
        // i+1 => '/' for closing markups, nothing otherwise
        // i+2 => the markup itself, without the '<' '>'
        //
        // note that i+2 might end with a '/' indicating an auto-closing markup
        //
        // the namespace is a literal prefix, so it is quoted before entering the pattern
        $pattern = '#<(/?)' . preg_quote((string) $namespace, '#') . '([^>]*)>#';
        $parts = preg_split($pattern, $html, -1, PREG_SPLIT_DELIM_CAPTURE);

        // stored reversed: buildTree() pops from the end, which avoids the
        // reindexing cost of shifting from the front on every iteration
        $this->matches = array_reverse($parts);

        // the flat representation is now ready, we can create our tree
        $tree = $this->buildTree();

        // if comments or processing options were escaped, we can
        // safely unescape them now
        if ($this->escaped) {
            $tree = $this->unescapeSpecials($tree);
        }

        return $tree;
    }

    /**
     * Hides the '<' and '>' of comments and processing instructions so that
     * the markup splitter ignores them.
     *
     * @param string $html
     *
     * @return string
     */
    private function escapeSpecials($html) {
        // here we escape comments ('.*' so that the empty comment '<!---->' is covered too)
        $html = preg_replace_callback('#<!--.*-->#sU', array($this, 'escapeSpecials_callback'), $html);

        // and processing options
        $html = preg_replace_callback('#<\?.+\?>#sU', array($this, 'escapeSpecials_callback'), $html);

        return $html;
    }

    /**
     * Replaces '<' and '>' in a matched comment / processing instruction with
     * the placeholder bytes \x01 and \x02, and records that escaping happened.
     *
     * @param array $m Match array given by preg_replace_callback().
     *
     * @return string
     */
    private function escapeSpecials_callback($m) {
        $this->escaped = true;

        return str_replace(array('<', '>'), array("\x01", "\x02"), $m[0]);
    }

    /**
     * Recursively restores the '<' and '>' replaced by escapeSpecials_callback().
     *
     * @param array|string $tree A tree, a node, or a single string.
     *
     * @return array|string
     */
    private function unescapeSpecials($tree) {
        if (is_array($tree)) {
            return array_map(array($this, 'unescapeSpecials'), $tree);
        }

        return str_replace(array("\x01", "\x02"), array('<', '>'), $tree);
    }

    /**
     * Consumes $this->matches and builds the list of sibling nodes up to the
     * next closing markup (or the end of the input). Calls itself to collect
     * the children of every opening markup.
     *
     * @return array
     */
    private function buildTree() {
        $nodes = array();
        $closing = false;
        $i = 0;

        while (($value = array_pop($this->matches)) !== NULL) {
            switch ($i++ % 3) {
                case 0:
                    // text: kept unless it only contains whitespace. the comparison
                    // is explicit because the string "0" is falsy in PHP
                    if (trim($value) !== '') {
                        $nodes[] = $value;
                    }
                    break;

                case 1:
                    $closing = ($value === '/');
                    break;

                case 2:
                    if (substr($value, -1) === '/') {
                        // auto closing
                        $nodes[] = $this->parseMarkup(substr($value, 0, -1));
                    } else if ($closing) {
                        // closing markup: it must match the last opened one
                        $open = array_pop($this->opened);

                        if ($value !== $open) {
                            $this->error($value, $open);
                        }

                        return $nodes;
                    } else {
                        // this is an open markup with possible children
                        $node = $this->parseMarkup($value);

                        // push the markup name into the opened markups
                        $this->opened[] = $node['name'];

                        // create the node and parse its children
                        $node['children'] = $this->buildTree();

                        $nodes[] = $node;
                    }
                    break;
            }
        }

        return $nodes;
    }

    /**
     * Splits the inside of a markup (without '<', '>' and trailing '/') into
     * its name and its double-quoted attributes.
     *
     * @param string $markup e.g. 'a href="index.html" title=""'
     *
     * @return array array('name' => string, 'args' => array(name => decoded value))
     */
    public function parseMarkup($markup) {
        // get markup's name; empty when the markup is empty or starts with whitespace
        $name = preg_match('#^[^\s]+#', $markup, $matches) ? $matches[0] : '';

        // get markup's arguments. the name stops at whitespace so that a valueless
        // attribute is not glued to the following one, and the value may be empty
        preg_match_all('#\s+([^\s=]+)\s*=\s*"([^"]*)"#', $markup, $matches, PREG_SET_ORDER);

        // transform the matches into a nice key/value array
        $args = array();

        foreach ($matches as $m) {
            // we unescape the html entities of the argument's value
            $args[$m[1]] = html_entity_decode($m[2], ENT_QUOTES, $this->encoding);
        }

        return array('name' => $name, 'args' => $args);
    }

    /**
     * Flags the document as malformed and prints a message describing the
     * mismatching closing markup.
     *
     * @param string      $markup   The closing markup that was found.
     * @param string|null $expected The markup that was open (null when none was).
     */
    public function error($markup, $expected) {
        $this->malformed = true;

        printf('unexpected closing markup "%s", should be "%s"', $markup, $expected);
    }
}
|
|
|
143 |
|
|
|
144 |
?>
|