Subversion Repositories Applications.bazar

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
468 mathias 1
<?php
2
/**
3
 *
4
 * @author Olivier Laviale
5
 * @see http://www.weirdog.com/blog/php/un-parser-html-des-plus-leger.html
6
 *
7
 */
8
class WdHTMLParser {
    // Lightweight markup parser: splits a string on its markups and turns the
    // resulting flat token list into a nested array tree.
    //
    // Every markup node is an array('name' => string, 'args' => array) and,
    // for markups that were opened (not auto-closed), a 'children' array that
    // mixes text nodes (strings) and further markup nodes.
    //
    // The parser expects well-formed, XML-like input: every opened markup must
    // be closed or auto-closed ('<br/>').

    /** Character set used when decoding entities in argument values. */
    private $encoding = 'utf-8';

    /** Flat token list produced by preg_split(), consumed by buildTree(). */
    private $matches = array();

    /** Index of the next token of $matches to consume (shared across recursion). */
    private $cursor = 0;

    /** True when at least one comment / processing option was escaped. */
    private $escaped = false;

    /** Stack of the names of the markups currently opened. */
    private $opened = array();

    /** True when the last parse() met an unexpected closing or an unclosed markup. */
    public $malformed;

    /**
     * Parses a markup string into a tree of nodes.
     *
     * @param string      $html      The source to parse.
     * @param string|null $namespace Optional prefix required right after '<' or '</'
     *                               (e.g. 'wdp:'). It is inserted verbatim in the
     *                               split pattern, so it must be regex-safe.
     * @param string      $encoding  Character set used to decode argument values.
     *
     * @return array The list of top-level nodes (strings and markup arrays).
     */
    public function parse($html, $namespace=NULL, $encoding='utf-8') {
        // reset the whole state, so that a previous (possibly malformed) parse
        // cannot leak its opened markups or its escape flag into this one
        $this->malformed = false;
        $this->encoding = $encoding;
        $this->escaped = false;
        $this->opened = array();
        $this->cursor = 0;

        // we take care of escaping comments and processing options. they will not be parsed
        // and will end as text nodes
        $html = $this->escapeSpecials($html);

        // in order to create a tree, we first need to split the HTML using the markups,
        // creating a nice flat array of texts and opening and closing markups.
        //
        // the array can be read as follows :
        //
        // i+0 => some text
        // i+1 => '/' for closing markups, nothing otherwise
        // i+2 => the markup itself, without the '<' '>'
        //
        // note that i+2 might end with a '/' indicating an auto-closing markup
        $this->matches = preg_split('#<(/?)' . $namespace . '([^>]*)>#', $html, -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($this->matches === false) {
            // the split failed (invalid pattern or PCRE limit), nothing can be parsed
            $this->matches = array();
            $this->malformed = true;

            return array();
        }

        // the flat representation is now ready, we can create our tree
        $tree = $this->buildTree();

        // markups still on the stack were never closed
        if ($this->opened) {
            $this->malformed = true;
            $this->opened = array();
        }

        // release the token list, it is of no use anymore
        $this->matches = array();
        $this->cursor = 0;

        // if comments or processing options were escaped, we can
        // safely unescape them now
        if ($this->escaped) {
            $tree = $this->unescapeSpecials($tree);
        }

        return $tree;
    }

    /**
     * Neutralizes comments and processing options so they are not split as markups.
     *
     * @param string $html
     *
     * @return string The source with '<' and '>' masked inside the special sections.
     */
    private function escapeSpecials($html) {
        // here we escape comments
        $html = preg_replace_callback('#<\!--.+-->#sU', array($this, 'escapeSpecials_callback'), $html);

        // and processing options
        $html = preg_replace_callback('#<\?.+\?>#sU', array($this, 'escapeSpecials_callback'), $html);

        return $html;
    }

    /**
     * Replaces '<' and '>' of a matched special section with the control
     * characters "\x01" and "\x02", and records that unescaping is required.
     *
     * @param array $m The match, $m[0] being the whole section.
     *
     * @return string
     */
    private function escapeSpecials_callback($m) {
        $this->escaped = true;

        return str_replace(array('<', '>'), array("\x01", "\x02"), $m[0]);
    }

    /**
     * Recursively restores the '<' and '>' masked by escapeSpecials_callback().
     *
     * @param array|string $tree A tree, a node or a string.
     *
     * @return array|string
     */
    private function unescapeSpecials($tree) {
        if (is_array($tree)) {
            return array_map(array($this, 'unescapeSpecials'), $tree);
        }

        return str_replace(array("\x01", "\x02"), array('<', '>'), $tree);
    }

    /**
     * Builds one level of the tree from the tokens located at the cursor.
     *
     * The method calls itself for each opened markup and returns when the
     * matching closing markup is met, or when the tokens are exhausted.
     *
     * @return array The nodes of the level.
     */
    private function buildTree() {
        $nodes = array();
        $count = count($this->matches);

        $i = 0;
        $closing = false;

        // the cursor is shared with the recursive calls: when a child level returns,
        // it has consumed a multiple of three tokens, so $i stays aligned.
        while ($this->cursor < $count) {
            $value = $this->matches[$this->cursor++];

            switch ($i++ % 3) {
                case 0:
                    // if the trimmed value is not empty we preserve the value,
                    // otherwise we discard it. the comparison is explicit because
                    // the text "0" is falsy and would be discarded otherwise.
                    if (trim($value) !== '') {
                        $nodes[] = $value;
                    }
                    break;

                case 1:
                    $closing = ($value === '/');
                    break;

                case 2:
                    if (substr($value, -1, 1) === '/') {
                        // auto closing
                        $nodes[] = $this->parseMarkup(substr($value, 0, -1));
                    } else if ($closing) {
                        // closing markup, spaces before the '>' are tolerated
                        $name = trim($value);
                        $open = array_pop($this->opened);

                        // $open is NULL when no markup is opened
                        if ($name !== $open) {
                            $this->error($name, $open);
                        }

                        return $nodes;
                    } else {
                        // this is an open markup with possible children
                        $node = $this->parseMarkup($value);

                        // push the markup name into the opened markups
                        $this->opened[] = $node['name'];

                        // create the node and parse its children
                        $node['children'] = $this->buildTree();

                        $nodes[] = $node;
                    }
                    break;
            }
        }

        return $nodes;
    }

    /**
     * Extracts the name and the arguments of a markup.
     *
     * Only arguments written as name="value" are collected; their values are
     * entity-decoded. Arguments without a value (e.g. 'disabled') are ignored.
     *
     * @param string $markup The markup without its '<' and '>'.
     *
     * @return array array('name' => string, 'args' => array of name => value)
     */
    public function parseMarkup($markup) {
        // get markup's name, which is empty if the markup does not start with one
        $name = preg_match('#^[^\s]+#', $markup, $matches) ? $matches[0] : '';

        // get markup's arguments. the name can hold neither spaces nor '=', so that
        // a valueless argument is not merged with the name of the following one,
        // and the value may be empty (alt="").
        preg_match_all('#\s+([^\s=]+)\s*=\s*"([^"]*)"#', $markup, $matches, PREG_SET_ORDER);

        // transform the matches into a nice key/value array
        $args = array();
        foreach ($matches as $m) {
            // we unescape the html entities of the argument's value
            $args[$m[1]] = html_entity_decode($m[2], ENT_QUOTES, $this->encoding);
        }

        return array('name' => $name, 'args' => $args);
    }

    /**
     * Flags the document as malformed and prints a message about the
     * unexpected closing markup.
     *
     * @param string      $markup   The closing markup that was met.
     * @param string|null $expected The markup that should have been closed, if any.
     */
    public function error($markup, $expected) {
        $this->malformed = true;
        printf('unexpected closing markup "%s", should be "%s"', $markup, $expected);
    }
}
143
 
144
?>