42 |
aurelien |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
/**
|
|
|
4 |
* This module implements a VERY limited parser that finds <link> tags
|
|
|
5 |
* in the head of HTML or XHTML documents and parses out their
|
|
|
6 |
* attributes according to the OpenID spec. It is a liberal parser,
|
|
|
7 |
* but it requires these things from the data in order to work:
|
|
|
8 |
*
|
|
|
9 |
* - There must be an open <html> tag
|
|
|
10 |
*
|
|
|
11 |
* - There must be an open <head> tag inside of the <html> tag
|
|
|
12 |
*
|
|
|
13 |
* - Only <link>s that are found inside of the <head> tag are parsed
|
|
|
14 |
* (this is by design)
|
|
|
15 |
*
|
|
|
16 |
* - The parser follows the OpenID specification in resolving the
|
|
|
17 |
* attributes of the link tags. This means that the attributes DO
|
|
|
18 |
* NOT get resolved as they would by an XML or HTML parser. In
|
|
|
19 |
* particular, only certain entities get replaced, and href
|
|
|
20 |
* attributes do not get resolved relative to a base URL.
|
|
|
21 |
*
|
|
|
22 |
* From http://openid.net/specs.bml:
|
|
|
23 |
*
|
|
|
24 |
* - The openid.server URL MUST be an absolute URL. OpenID consumers
|
|
|
25 |
* MUST NOT attempt to resolve relative URLs.
|
|
|
26 |
*
|
|
|
27 |
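* - The openid.server URL MUST NOT include entities other than
*   &amp;amp;, &amp;lt;, &amp;gt;, and &amp;quot; (that is, the entities
*   written as "&amp;", "&lt;", "&gt;" and "&quot;" in the document).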
|
|
|
29 |
*
|
|
|
30 |
* The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
|
|
|
31 |
* of quoting are allowed for attributes.
|
|
|
32 |
*
|
|
|
33 |
* The parser deals with invalid markup in these ways:
|
|
|
34 |
*
|
|
|
35 |
* - Tag names are not case-sensitive
|
|
|
36 |
*
|
|
|
37 |
* - The <html> tag is accepted even when it is not at the top level
|
|
|
38 |
*
|
|
|
39 |
* - The <head> tag is accepted even when it is not a direct child of
|
|
|
40 |
* the <html> tag, but a <html> tag must be an ancestor of the
|
|
|
41 |
* <head> tag
|
|
|
42 |
*
|
|
|
43 |
* - <link> tags are accepted even when they are not direct children
|
|
|
44 |
* of the <head> tag, but a <head> tag must be an ancestor of the
|
|
|
45 |
* <link> tag
|
|
|
46 |
*
|
|
|
47 |
* - If there is no closing tag for an open <html> or <head> tag, the
|
|
|
48 |
* remainder of the document is viewed as being inside of the
|
|
|
49 |
* tag. If there is no closing tag for a <link> tag, the link tag is
|
|
|
50 |
* treated as a short tag. Exceptions to this rule are that <html>
|
|
|
51 |
* closes <html> and <body> or <head> closes <head>
|
|
|
52 |
*
|
|
|
53 |
* - Attributes of the <link> tag are not required to be quoted.
|
|
|
54 |
*
|
|
|
55 |
* - In the case of duplicated attribute names, the attribute coming
|
|
|
56 |
* last in the tag will be the value returned.
|
|
|
57 |
*
|
|
|
58 |
* - Any text that does not parse as an attribute within a link tag
|
|
|
59 |
* will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
|
|
|
60 |
* ignore pumpkin)
|
|
|
61 |
*
|
|
|
62 |
* - If there is more than one <html> or <head> tag, the parser only
*   looks inside the first one.
|
|
|
64 |
*
|
|
|
65 |
* - The contents of closed <script> elements are ignored entirely.
*   An unclosed <script> tag is itself ignored, so the markup that
*   follows it is still parsed.
|
|
|
67 |
*
|
|
|
68 |
* - Any other invalid markup is ignored, including unclosed SGML
|
|
|
69 |
* comments and unclosed <![CDATA[blocks.
|
|
|
70 |
*
|
|
|
71 |
* PHP versions 4 and 5
|
|
|
72 |
*
|
|
|
73 |
* LICENSE: See the COPYING file included in this distribution.
|
|
|
74 |
*
|
|
|
75 |
* @access private
|
|
|
76 |
* @package OpenID
|
|
|
77 |
* @author JanRain, Inc. <openid@janrain.com>
|
|
|
78 |
* @copyright 2005 Janrain, Inc.
|
|
|
79 |
* @license http://www.gnu.org/copyleft/lesser.html LGPL
|
|
|
80 |
*/
|
|
|
81 |
|
|
|
82 |
/**
|
|
|
83 |
* Require Auth_OpenID::arrayGet().
|
|
|
84 |
*/
|
|
|
85 |
require_once "Auth/OpenID.php";
|
|
|
86 |
|
|
|
87 |
class Auth_OpenID_Parse {

    /**
     * Specify some flags for use with regex matching
     * ("s": dot matches newlines, "i": case-insensitive).
     */
    var $_re_flags = "si";

    /**
     * Stuff to remove before we start looking for tags: SGML
     * comments, CDATA blocks and closed <script> elements.  The
     * constructor wraps this in delimiters and appends the flags.
     */
    var $_removed_re =
        "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Starts with the tag name at a word boundary, where the tag name
     * is not a namespace.  This is a sprintf() template: the first %s
     * is the tag name, the second %s is the expression for the tag
     * names that close it.
     */
    var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

    /**
     * Matches one name=value attribute; the value may be double
     * quoted, single quoted or unquoted.  The constructor wraps this
     * in delimiters and appends the flags.
     */
    var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /**
     * Complete regular expression matching a <link ...> tag; built by
     * the constructor.
     */
    var $_link_find = null;

    /**
     * Map of entity name => replacement character; filled in by the
     * constructor.
     */
    var $_entity_replacements = array();

    /**
     * Expression fragment matching any one of the supported entities;
     * built by the constructor.
     */
    var $_ent_replace = null;

    /**
     * Build the regular expressions and the entity table used by the
     * parsing methods.
     *
     * Declared as __construct() so that it actually runs on PHP 8,
     * where a method named after its class is no longer treated as a
     * constructor.
     */
    function __construct()
    {
        $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_entity_replacements = array(
            'amp' => '&',
            'lt' => '<',
            'gt' => '>',
            'quot' => '"'
        );

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        // Alternation over the entity NAMES (the keys), giving
        // "&(amp|lt|gt|quot);".  Imploding the array itself would
        // join the replacement characters instead.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * PHP 4 style constructor, kept so that PHP 4 (which does not
     * know __construct()) and any code calling this method directly
     * keep working.  It must stay below __construct() in the class
     * body.
     */
    function Auth_OpenID_Parse()
    {
        $this->__construct();
    }

    /**
     * Returns a regular expression that will match a given tag in an
     * SGML string.
     *
     * @param string $tag_name The name of the tag to match
     * @param array $close_tags Optional names of other tags that
     * also terminate the content of $tag_name
     * @return string A delimited regular expression including flags
     */
    function tagMatcher($tag_name, $close_tags = null)
    {
        if ($close_tags) {
            $options = implode("|", array_merge(array($tag_name), $close_tags));
            $closer = sprintf("(?:%s)", $options);
        } else {
            $closer = $tag_name;
        }

        $expr = sprintf($this->_tag_expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns the regular expression matching the <html> element.
     */
    function htmlFind()
    {
        return $this->tagMatcher('html');
    }

    /**
     * Returns the regular expression matching the <head> element; a
     * <body> tag also ends the head.
     */
    function headFind()
    {
        return $this->tagMatcher('head', array('body'));
    }

    /**
     * Replace the supported entities (&amp;amp; &amp;lt; &amp;gt;
     * &amp;quot;) in a string with the characters they stand for.
     *
     * The replacement is done in a single pass, so text produced by
     * one replacement is never decoded a second time: "&amp;amp;lt;"
     * becomes "&amp;lt;", not "<".
     *
     * @param string $str The text to decode
     * @return string The decoded text
     */
    function replaceEntities($str)
    {
        $pairs = array();
        foreach ($this->_entity_replacements as $name => $char) {
            $pairs['&' . $name . ';'] = $char;
        }

        if (!$pairs) {
            return $str;
        }

        return strtr($str, $pairs);
    }

    /**
     * Strip one pair of matching single or double quotes surrounding
     * a string.  Strings that are not quoted are returned unchanged.
     *
     * @param string $str A raw attribute value
     * @return string The value without its surrounding quotes
     */
    function removeQuotes($str)
    {
        $matches = array();

        // The "s" flag lets a quoted value contain newlines, which
        // the attribute expression accepts as well.
        if (preg_match('/^(["\'])(.*)\1$/s', $str, $matches)) {
            return $matches[2];
        }

        return $str;
    }

    /**
     * Find all link tags in a string representing a HTML document and
     * return a list of their attributes.
     *
     * @param string $html The text to parse
     * @return array $list An array of arrays of attributes, one for each
     * link tag
     */
    function parseLinkAttrs($html)
    {
        $stripped = preg_replace($this->_removed_re,
                                 "",
                                 $html);

        // preg_replace() yields null when the regex engine fails;
        // there is nothing to parse in that case.
        if (!is_string($stripped)) {
            return array();
        }

        // Try to find the <HTML> tag.
        $html_re = $this->htmlFind();
        $html_matches = array();
        if (!preg_match($html_re, $stripped, $html_matches)) {
            return array();
        }

        // Try to find the <HEAD> tag.
        $head_re = $this->headFind();
        $head_matches = array();
        if (!preg_match($head_re, $html_matches[0], $head_matches)) {
            return array();
        }

        $link_data = array();
        $link_matches = array();

        if (!preg_match_all($this->_link_find, $head_matches[0],
                            $link_matches)) {
            return array();
        }

        foreach ($link_matches[0] as $link) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $link, $attr_matches);
            $link_attrs = array();
            foreach ($attr_matches[0] as $index => $full_match) {
                $name = $attr_matches[1][$index];
                $value = $this->replaceEntities(
                              $this->removeQuotes($attr_matches[2][$index]));

                // Later duplicates overwrite earlier ones.
                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Does $target_rel appear among the whitespace-separated
     * relationships in $rel_attr?  The relationships in $rel_attr are
     * lower-cased before comparison; $target_rel is compared as given.
     *
     * @param string $rel_attr The value of a rel attribute
     * @param string $target_rel The relationship to look for
     * @return integer 1 if it is present, 0 otherwise
     */
    function relMatches($rel_attr, $target_rel)
    {
        // XXX: TESTME
        $target = (string) $target_rel;
        $rels = preg_split("/\s+/", trim($rel_attr));
        foreach ($rels as $rel) {
            // Strict comparison: tokens are compared as text, never
            // as numbers.
            if (strtolower($rel) === $target) {
                return 1;
            }
        }

        return 0;
    }

    /**
     * Does this link have target_rel as a relationship?
     *
     * @param array $link_attrs The attributes of one link tag
     * @param string $target_rel The relationship to look for
     * @return bool
     */
    function linkHasRel($link_attrs, $target_rel)
    {
        // XXX: TESTME
        $rel_attr = Auth_OpenID::arrayGet($link_attrs, 'rel', null);
        return ($rel_attr && $this->relMatches($rel_attr,
                                               $target_rel));
    }

    /**
     * Filter the list of link attributes on whether it has
     * target_rel as a relationship.
     *
     * @param array $link_attrs_list A list of link attribute arrays
     * @param string $target_rel The relationship to look for
     * @return array The matching entries, in their original order
     */
    function findLinksRel($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $result = array();
        foreach ($link_attrs_list as $attr) {
            if ($this->linkHasRel($attr, $target_rel)) {
                $result[] = $attr;
            }
        }

        return $result;
    }

    /**
     * Return the value of the href attribute for the first link
     * tag in the list that has target_rel as a relationship.
     *
     * @param array $link_attrs_list A list of link attribute arrays
     * @param string $target_rel The relationship to look for
     * @return mixed null when no link has the relationship; otherwise
     * the result of looking up 'href' on the first matching link
     */
    function findFirstHref($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $matches = $this->findLinksRel($link_attrs_list,
                                       $target_rel);
        if (!$matches) {
            return null;
        }
        $first = $matches[0];
        return Auth_OpenID::arrayGet($first, 'href', null);
    }
}
|
|
|
289 |
|
|
|
290 |
/**
 * Look for OpenID <link> tags in an HTML document.
 *
 * The document is parsed with Auth_OpenID_Parse and the first link
 * with rel "openid.server" supplies the server URL; the first link
 * with rel "openid.delegate" supplies the delegate URL.
 *
 * @param string $html_text The HTML document to inspect
 * @return mixed false when the "openid.server" lookup yields null;
 * otherwise array($delegate_url, $server_url), where $delegate_url
 * is null if the document has no "openid.delegate" link
 */
function Auth_OpenID_legacy_discover($html_text)
{
    $parser = new Auth_OpenID_Parse();
    $links = $parser->parseLinkAttrs($html_text);

    $server = $parser->findFirstHref($links, 'openid.server');
    if ($server === null) {
        // Without a server there is nothing to report.
        return false;
    }

    $delegate = $parser->findFirstHref($links, 'openid.delegate');

    return array($delegate, $server);
}
|
|
|
307 |
|
|
|
308 |
?>
|