New file |
0,0 → 1,144 |
<?php |
/** |
* |
* @author Olivier Laviale |
* @see http://www.weirdog.com/blog/php/un-parser-html-des-plus-leger.html |
* |
*/ |
class WdHTMLParser { |
private $encoding; |
private $matches; |
private $escaped; |
private $opened = array(); |
|
public $malformed; |
|
public function parse($html, $namespace=NULL, $encoding='utf-8') { |
$this->malformed = false; |
$this->encoding = $encoding; |
|
// we take care of escaping comments and processing options. they will not be parsed |
// and will end as text nodes |
$html = $this->escapeSpecials($html); |
|
// in order to create a tree, we first need to split the HTML using the markups, |
// creating a nice flat array of texts and opening and closing markups. |
// |
// the array can be read as follows : |
// |
// i+0 => some text |
// i+1 => '/' for closing markups, nothing otherwise |
// i+2 => the markup it self, without the '<' '>' |
// |
// note that i+2 might end with a '/' indicating an auto-closing markup |
$this->matches = preg_split('#<(/?)' . $namespace . '([^>]*)>#', $html, -1, PREG_SPLIT_DELIM_CAPTURE); |
|
// the flat representation is now ready, we can create our tree |
$tree = $this->buildTree(); |
|
// if comments or processing options where escaped, we can |
// safely unescape them now |
if ($this->escaped) { |
$tree = $this->unescapeSpecials($tree); |
} |
|
return $tree; |
} |
|
private function escapeSpecials($html) { |
// here we escape comments |
$html = preg_replace_callback('#<\!--.+-->#sU', array($this, 'escapeSpecials_callback'), $html); |
|
// and processing options |
$html = preg_replace_callback('#<\?.+\?>#sU', array($this, 'escapeSpecials_callback'), $html); |
|
return $html; |
} |
|
private function escapeSpecials_callback($m) { |
$this->escaped = true; |
$text = $m[0]; |
$text = str_replace(array('<', '>'), array("\x01", "\x02"), $text); |
return $text; |
} |
|
private function unescapeSpecials($tree) { |
return is_array($tree) ? array_map(array($this, 'unescapeSpecials'), $tree) : str_replace(array("\x01", "\x02"), array('<', '>'), $tree); |
} |
|
private function buildTree() { |
$nodes = array(); |
|
$i = 0; |
$text = NULL; |
while (($value = array_shift($this->matches)) !== NULL) { |
switch ($i++ % 3) { |
case 0: |
// if the trimed value is not empty we preserve the value, |
// otherwise we discard it. |
|
if (trim($value)){ |
$nodes[] = $value; |
} |
break; |
case 1: |
$closing = ($value == '/'); |
break; |
case 2: |
if (substr($value, -1, 1) == '/') { |
// auto closing |
$nodes[] = $this->parseMarkup(substr($value, 0, -1)); |
} else if ($closing) { |
// closing markup |
$open = array_pop($this->opened); |
|
if ($value != $open) { |
$this->error($value, $open); |
} |
|
return $nodes; |
} else { |
// this is an open markup with possible children |
$node = $this->parseMarkup($value); |
|
// push the markup name into the opened markups |
$this->opened[] = $node['name']; |
|
// create the node and parse its children |
$node['children'] = $this->buildTree($this->matches); |
|
$nodes[] = $node; |
} |
break; |
} |
} |
|
return $nodes; |
} |
|
public function parseMarkup($markup) { |
// get markup's name |
preg_match('#^[^\s]+#', $markup, $matches); |
|
$name = $matches[0]; |
|
// get markup's arguments |
preg_match_all('#\s+([^=]+)\s*=\s*"([^"]+)"#', $markup, $matches, PREG_SET_ORDER); |
|
// transform the matches into a nice key/value array |
$args = array(); |
foreach ($matches as $m) { |
// we unescape the html entities of the argument's value |
$args[$m[1]] = html_entity_decode($m[2], ENT_QUOTES, $this->encoding); |
} |
|
return array('name' => $name, 'args' => $args); |
} |
|
public function error($markup, $expected) { |
$this->malformed = true; |
printf('unexpected closing markup "%s", should be "%s"', $markup, $expected); |
} |
} |
|
?> |