Subversion Repositories Applications.annuaire

Rev

Blame | Last modification | View Log | RSS feed

<?php
/**
 * Lightweight HTML/XML parser that turns a markup string into a nested array tree.
 *
 * The resulting tree is a list of nodes where:
 *
 * - a text node is a plain string (whitespace-only text is discarded);
 * - a markup node is an array with the keys 'name' (string) and 'args'
 *   (attribute name => decoded value), plus 'children' (a list of nodes)
 *   when the markup is not auto-closing.
 *
 * Comments and processing options are not parsed: they are kept verbatim
 * and end up inside text nodes.
 *
 * @author Olivier Laviale
 * @see http://www.weirdog.com/blog/php/un-parser-html-des-plus-leger.html
 */
class WdHTMLParser {
    // encoding used to decode the entities found in argument values
    private $encoding = 'utf-8';

    // flat list produced by preg_split(): text, '/' flag, markup, text, ...
    private $matches = array();

    // index of the next entry of $matches to consume
    private $position = 0;

    // true when at least one comment or processing option was escaped
    private $escaped = false;

    // names of the markups currently opened, innermost last
    private $opened = array();

    // set to true by error() when a closing markup does not match
    public $malformed;

    /**
     * Parses a markup string into a tree of nodes.
     *
     * @param string $html the source to parse
     * @param string|null $namespace literal prefix (e.g. 'wd:') that a markup must
     * have to be recognized; markups without the prefix are left in text nodes
     * @param string $encoding encoding used to decode entities in argument values
     *
     * @return array the list of top level nodes
     */
    public function parse($html, $namespace=NULL, $encoding='utf-8') {
        // reset every piece of per-parse state so that an instance can be reused,
        // even after a previous malformed document left markups opened
        $this->malformed = false;
        $this->encoding = $encoding;
        $this->escaped = false;
        $this->opened = array();
        $this->position = 0;

        // we take care of escaping comments and processing options. they will not be parsed
        // and will end as text nodes
        $html = $this->escapeSpecials($html);

        // in order to create a tree, we first need to split the HTML using the markups,
        // creating a nice flat array of texts and opening and closing markups.
        //
        // the array can be read as follows :
        //
        // i+0 => some text
        // i+1 => '/' for closing markups, nothing otherwise
        // i+2 => the markup itself, without the '<' '>'
        //
        // note that i+2 might end with a '/' indicating an auto-closing markup
        //
        // the namespace is a literal prefix, it is quoted so that characters such
        // as '.' are not interpreted as regular expression operators
        $pattern = '#<(/?)' . preg_quote((string) $namespace, '#') . '([^>]*)>#';
        $matches = preg_split($pattern, $html, -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($matches === false) {
            // the regular expression engine failed, nothing can be built
            $this->matches = array();
            $this->malformed = true;

            return array();
        }

        $this->matches = $matches;

        // the flat representation is now ready, we can create our tree
        $tree = $this->buildTree();

        // the flat representation is no longer needed
        $this->matches = array();

        // if comments or processing options were escaped, we can
        // safely unescape them now
        if ($this->escaped) {
            $tree = $this->unescapeSpecials($tree);
        }

        return $tree;
    }

    /**
     * Escapes comments and processing options so that the markups they
     * contain are not seen by the splitter.
     *
     * @param string $html
     *
     * @return string the source with '<' and '>' replaced inside the escaped spans
     */
    private function escapeSpecials($html) {
        // here we escape comments ('.*' so that the empty comment '<!---->' is handled too)
        $html = preg_replace_callback('#<\!--.*-->#sU', array($this, 'escapeSpecials_callback'), $html);

        // and processing options
        $html = preg_replace_callback('#<\?.*\?>#sU', array($this, 'escapeSpecials_callback'), $html);

        return $html;
    }

    /**
     * Replaces '<' and '>' by the placeholders "\x01" and "\x02" in a matched span
     * and records that something was escaped.
     *
     * @param array $m the match given by preg_replace_callback()
     *
     * @return string
     */
    private function escapeSpecials_callback($m) {
        $this->escaped = true;

        return str_replace(array('<', '>'), array("\x01", "\x02"), $m[0]);
    }

    /**
     * Recursively restores the '<' and '>' replaced by escapeSpecials_callback().
     *
     * @param array|string $tree a tree, a node or a string
     *
     * @return array|string
     */
    private function unescapeSpecials($tree) {
        if (is_array($tree)) {
            return array_map(array($this, 'unescapeSpecials'), $tree);
        }

        return str_replace(array("\x01", "\x02"), array('<', '>'), $tree);
    }

    /**
     * Consumes the flat list from the current position and returns the nodes
     * found until the closing markup of the current level (or the end of the list).
     *
     * The method calls itself to collect the children of each opened markup.
     * A cursor is used instead of shifting the list, which avoids reindexing
     * the whole list for every entry.
     *
     * @return array the list of nodes of the current level
     */
    private function buildTree() {
        $nodes = array();
        $closing = false;
        $total = count($this->matches);

        while ($this->position < $total) {
            $index = $this->position++;
            $value = $this->matches[$index];

            // entries always come by groups of three (text, '/' flag, markup),
            // so the absolute index tells which kind of entry this is
            switch ($index % 3) {
                case 0:
                    // text: preserved unless it only contains whitespace.
                    // the comparison is explicit because the text "0" is falsy
                    // but must be kept.
                    if (trim($value) !== '') {
                        $nodes[] = $value;
                    }
                    break;

                case 1:
                    $closing = ($value === '/');
                    break;

                case 2:
                    if (substr($value, -1) === '/') {
                        // auto closing
                        $nodes[] = $this->parseMarkup(substr($value, 0, -1));
                    } else if (!$closing) {
                        // this is an open markup with possible children
                        $node = $this->parseMarkup($value);

                        // push the markup name into the opened markups
                        $this->opened[] = $node['name'];

                        // create the node and parse its children
                        $node['children'] = $this->buildTree();

                        $nodes[] = $node;
                    } else if (!$this->opened) {
                        // closing markup while nothing is opened: report it and
                        // keep going, returning here would silently drop the
                        // remainder of the document
                        $this->error(trim($value), NULL);
                    } else {
                        // closing markup, whitespace as in '</p >' is not part of the name
                        $name = trim($value);
                        $open = array_pop($this->opened);

                        if ($name !== $open) {
                            $this->error($name, $open);
                        }

                        return $nodes;
                    }
                    break;
            }
        }

        return $nodes;
    }

    /**
     * Extracts the name and the arguments of a markup.
     *
     * Argument values may be enclosed in double or single quotes and may be empty.
     * Arguments without a quoted value are ignored.
     *
     * @param string $markup the content of the markup, without the '<' '>'
     *
     * @return array an array with the keys 'name' (string, empty if the markup
     * has no name) and 'args' (argument name => value with entities decoded)
     */
    public function parseMarkup($markup) {
        // get markup's name
        $name = preg_match('#^[^\s]+#', $markup, $matches) ? $matches[0] : '';

        // get markup's arguments
        preg_match_all('#\s+([^\s=]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\')#', $markup, $matches, PREG_SET_ORDER);

        // transform the matches into a nice key/value array
        $args = array();

        foreach ($matches as $m) {
            // the third group only exists for single-quoted values
            $value = isset($m[3]) ? $m[3] : $m[2];

            // we unescape the html entities of the argument's value
            $args[$m[1]] = html_entity_decode($value, ENT_QUOTES, $this->encoding);
        }

        return array('name' => $name, 'args' => $args);
    }

    /**
     * Flags the document as malformed and prints a message about the
     * unexpected closing markup.
     *
     * @param string $markup the closing markup found
     * @param string|null $expected the markup that should have been closed, if any
     */
    public function error($markup, $expected) {
        $this->malformed = true;
        printf('unexpected closing markup "%s", should be "%s"', $markup, $expected);
    }
}

?>