WebSVN – eFlore/Applications.coel – Blame – //branches/v1.6-muscardin/jrest/services/bibliotheque/WdHTMLParser.php

Rev	Author	Line No.	Line
1547	jpm	1	`<?php`
		2	`/**`
		3	`*`
		4	`* @author Olivier Laviale`
		5	`* @see http://www.weirdog.com/blog/php/un-parser-html-des-plus-leger.html`
		6	`*`
		7	`*/`
		8	`class WdHTMLParser {`
		9	`private $encoding;`
		10	`private $matches;`
		11	`private $escaped;`
		12	`private $opened = array();`
		13
		14	`public $malformed;`
		15
		16	`public function parse($html, $namespace=NULL, $encoding='utf-8') {`
		17	`$this->malformed = false;`
		18	`$this->encoding = $encoding;`
		19
		20	`// we take care of escaping comments and processing options. they will not be parsed`
		21	`// and will end as text nodes`
		22	`$html = $this->escapeSpecials($html);`
		23
		24	`// in order to create a tree, we first need to split the HTML using the markups,`
		25	`// creating a nice flat array of texts and opening and closing markups.`
		26	`//`
		27	`// the array can be read as follows :`
		28	`//`
		29	`// i+0 => some text`
		30	`// i+1 => '/' for closing markups, nothing otherwise`
		31	`// i+2 => the markup it self, without the '<' '>'`
		32	`//`
		33	`// note that i+2 might end with a '/' indicating an auto-closing markup`
		34	`$this->matches = preg_split('#<(/?)' . $namespace . '([^>]*)>#', $html, -1, PREG_SPLIT_DELIM_CAPTURE);`
		35
		36	`// the flat representation is now ready, we can create our tree`
		37	`$tree = $this->buildTree();`
		38
		39	`// if comments or processing options where escaped, we can`
		40	`// safely unescape them now`
		41	`if ($this->escaped) {`
		42	`$tree = $this->unescapeSpecials($tree);`
		43	`}`
		44
		45	`return $tree;`
		46	`}`
		47
		48	`private function escapeSpecials($html) {`
		49	`// here we escape comments`
		50	`$html = preg_replace_callback('#<\!--.+-->#sU', array($this, 'escapeSpecials_callback'), $html);`
		51
		52	`// and processing options`
		53	`$html = preg_replace_callback('#<\?.+\?>#sU', array($this, 'escapeSpecials_callback'), $html);`
		54
		55	`return $html;`
		56	`}`
		57
		58	`private function escapeSpecials_callback($m) {`
		59	`$this->escaped = true;`
		60	`$text = $m[0];`
		61	`$text = str_replace(array('<', '>'), array("\x01", "\x02"), $text);`
		62	`return $text;`
		63	`}`
		64
		65	`private function unescapeSpecials($tree) {`
		66	`return is_array($tree) ? array_map(array($this, 'unescapeSpecials'), $tree) : str_replace(array("\x01", "\x02"), array('<', '>'), $tree);`
		67	`}`
		68
		69	`private function buildTree() {`
		70	`$nodes = array();`
		71
		72	`$i = 0;`
		73	`$text = NULL;`
		74	`while (($value = array_shift($this->matches)) !== NULL) {`
		75	`switch ($i++ % 3) {`
		76	`case 0:`
		77	`// if the trimed value is not empty we preserve the value,`
		78	`// otherwise we discard it.`
		79
		80	`if (trim($value)){`
		81	`$nodes[] = $value;`
		82	`}`
		83	`break;`
		84	`case 1:`
		85	`$closing = ($value == '/');`
		86	`break;`
		87	`case 2:`
		88	`if (substr($value, -1, 1) == '/') {`
		89	`// auto closing`
		90	`$nodes[] = $this->parseMarkup(substr($value, 0, -1));`
		91	`} else if ($closing) {`
		92	`// closing markup`
		93	`$open = array_pop($this->opened);`
		94
		95	`if ($value != $open) {`
		96	`$this->error($value, $open);`
		97	`}`
		98
		99	`return $nodes;`
		100	`} else {`
		101	`// this is an open markup with possible children`
		102	`$node = $this->parseMarkup($value);`
		103
		104	`// push the markup name into the opened markups`
		105	`$this->opened[] = $node['name'];`
		106
		107	`// create the node and parse its children`
		108	`$node['children'] = $this->buildTree($this->matches);`
		109
		110	`$nodes[] = $node;`
		111	`}`
		112	`break;`
		113	`}`
		114	`}`
		115
		116	`return $nodes;`
		117	`}`
		118
		119	`public function parseMarkup($markup) {`
		120	`// get markup's name`
		121	`preg_match('#^[^\s]+#', $markup, $matches);`
		122
		123	`$name = $matches[0];`
		124
		125	`// get markup's arguments`
		126	`preg_match_all('#\s+([^=]+)\s=\s"([^"]+)"#', $markup, $matches, PREG_SET_ORDER);`
		127
		128	`// transform the matches into a nice key/value array`
		129	`$args = array();`
		130	`foreach ($matches as $m) {`
		131	`// we unescape the html entities of the argument's value`
		132	`$args[$m[1]] = html_entity_decode($m[2], ENT_QUOTES, $this->encoding);`
		133	`}`
		134
		135	`return array('name' => $name, 'args' => $args);`
		136	`}`
		137
		138	`public function error($markup, $expected) {`
		139	`$this->malformed = true;`
		140	`printf('unexpected closing markup "%s", should be "%s"', $markup, $expected);`
		141	`}`
		142	`}`
		143
		144	`?>`

Subversion Repositories eFlore/Applications.coel

(root)//branches/v1.6-muscardin/jrest/services/bibliotheque/WdHTMLParser.php – Rev 1547