Rev 418 | Blame | Last modification | View Log | RSS feed
<?php
/**
*
* @author Olivier Laviale
* @see http://www.weirdog.com/blog/php/un-parser-html-des-plus-leger.html
*
*/
class WdHTMLParser {
private $encoding;
private $matches;
private $escaped;
private $opened = array();
public $malformed;
public function parse($html, $namespace=NULL, $encoding='utf-8') {
$this->malformed = false;
$this->encoding = $encoding;
// we take care of escaping comments and processing options. they will not be parsed
// and will end as text nodes
$html = $this->escapeSpecials($html);
// in order to create a tree, we first need to split the HTML using the markups,
// creating a nice flat array of texts and opening and closing markups.
//
// the array can be read as follows :
//
// i+0 => some text
// i+1 => '/' for closing markups, nothing otherwise
// i+2 => the markup it self, without the '<' '>'
//
// note that i+2 might end with a '/' indicating an auto-closing markup
$this->matches = preg_split('#<(/?)' . $namespace . '([^>]*)>#', $html, -1, PREG_SPLIT_DELIM_CAPTURE);
// the flat representation is now ready, we can create our tree
$tree = $this->buildTree();
// if comments or processing options where escaped, we can
// safely unescape them now
if ($this->escaped) {
$tree = $this->unescapeSpecials($tree);
}
return $tree;
}
private function escapeSpecials($html) {
// here we escape comments
$html = preg_replace_callback('#<\!--.+-->#sU', array($this, 'escapeSpecials_callback'), $html);
// and processing options
$html = preg_replace_callback('#<\?.+\?>#sU', array($this, 'escapeSpecials_callback'), $html);
return $html;
}
private function escapeSpecials_callback($m) {
$this->escaped = true;
$text = $m[0];
$text = str_replace(array('<', '>'), array("\x01", "\x02"), $text);
return $text;
}
private function unescapeSpecials($tree) {
return is_array($tree) ? array_map(array($this, 'unescapeSpecials'), $tree) : str_replace(array("\x01", "\x02"), array('<', '>'), $tree);
}
private function buildTree() {
$nodes = array();
$i = 0;
$text = NULL;
while (($value = array_shift($this->matches)) !== NULL) {
switch ($i++ % 3) {
case 0:
// if the trimed value is not empty we preserve the value,
// otherwise we discard it.
if (trim($value)){
$nodes[] = $value;
}
break;
case 1:
$closing = ($value == '/');
break;
case 2:
if (substr($value, -1, 1) == '/') {
// auto closing
$nodes[] = $this->parseMarkup(substr($value, 0, -1));
} else if ($closing) {
// closing markup
$open = array_pop($this->opened);
if ($value != $open) {
$this->error($value, $open);
}
return $nodes;
} else {
// this is an open markup with possible children
$node = $this->parseMarkup($value);
// push the markup name into the opened markups
$this->opened[] = $node['name'];
// create the node and parse its children
$node['children'] = $this->buildTree($this->matches);
$nodes[] = $node;
}
break;
}
}
return $nodes;
}
public function parseMarkup($markup) {
// get markup's name
preg_match('#^[^\s]+#', $markup, $matches);
$name = $matches[0];
// get markup's arguments
preg_match_all('#\s+([^=]+)\s*=\s*"([^"]+)"#', $markup, $matches, PREG_SET_ORDER);
// transform the matches into a nice key/value array
$args = array();
foreach ($matches as $m) {
// we unescape the html entities of the argument's value
$args[$m[1]] = html_entity_decode($m[2], ENT_QUOTES, $this->encoding);
}
return array('name' => $name, 'args' => $args);
}
public function error($markup, $expected) {
$this->malformed = true;
printf('unexpected closing markup "%s", should be "%s"', $markup, $expected);
}
}
?>