New file |
0,0 → 1,308 |
<?php |
|
/** |
* This module implements a VERY limited parser that finds <link> tags |
* in the head of HTML or XHTML documents and parses out their |
* attributes according to the OpenID spec. It is a liberal parser, |
* but it requires these things from the data in order to work: |
* |
* - There must be an open <html> tag |
* |
* - There must be an open <head> tag inside of the <html> tag |
* |
* - Only <link>s that are found inside of the <head> tag are parsed |
* (this is by design) |
* |
* - The parser follows the OpenID specification in resolving the |
* attributes of the link tags. This means that the attributes DO |
* NOT get resolved as they would by an XML or HTML parser. In |
* particular, only certain entities get replaced, and href |
* attributes do not get resolved relative to a base URL. |
* |
* From http://openid.net/specs.bml: |
* |
* - The openid.server URL MUST be an absolute URL. OpenID consumers |
* MUST NOT attempt to resolve relative URLs. |
* |
* - The openid.server URL MUST NOT include entities other than &amp;,
* &lt;, &gt;, and &quot;.
* |
* The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds |
* of quoting are allowed for attributes. |
* |
* The parser deals with invalid markup in these ways: |
* |
* - Tag names are not case-sensitive |
* |
* - The <html> tag is accepted even when it is not at the top level |
* |
* - The <head> tag is accepted even when it is not a direct child of |
* the <html> tag, but a <html> tag must be an ancestor of the |
* <head> tag |
* |
* - <link> tags are accepted even when they are not direct children |
* of the <head> tag, but a <head> tag must be an ancestor of the |
* <link> tag |
* |
* - If there is no closing tag for an open <html> or <head> tag, the |
* remainder of the document is viewed as being inside of the |
* tag. If there is no closing tag for a <link> tag, the link tag is |
* treated as a short tag. Exceptions to this rule are that <html> |
* closes <html> and <body> or <head> closes <head> |
* |
* - Attributes of the <link> tag are not required to be quoted. |
* |
* - In the case of duplicated attribute names, the attribute coming |
* last in the tag will be the value returned. |
* |
* - Any text that does not parse as an attribute within a link tag |
* will be ignored. (e.g. <link pumpkin rel='openid.server' /> will |
* ignore pumpkin) |
* |
* - If there is more than one <html> or <head> tag, the parser only
* looks inside of the first one.
* |
* - The contents of <script> tags are ignored entirely, except |
* unclosed <script> tags. Unclosed <script> tags are ignored. |
* |
* - Any other invalid markup is ignored, including unclosed SGML |
* comments and unclosed <![CDATA[blocks. |
* |
* PHP versions 4 and 5 |
* |
* LICENSE: See the COPYING file included in this distribution. |
* |
* @access private |
* @package OpenID |
* @author JanRain, Inc. <openid@janrain.com> |
* @copyright 2005 Janrain, Inc. |
* @license http://www.gnu.org/copyleft/lesser.html LGPL |
*/ |
|
/** |
* Require Auth_OpenID::arrayGet(). |
*/ |
require_once "Auth/OpenID.php"; |
|
/**
 * Regex-based extractor for <link> tags found inside the <head> of the
 * first <html> element of a document.
 *
 * The raw pattern fragments are declared as properties and turned into
 * delimited PCRE patterns by the constructor, so an instance must be
 * constructed before any of the parsing methods are used.
 */
class Auth_OpenID_Parse {

    /**
     * Specify some flags for use with regex matching.
     */
    var $_re_flags = "si";

    /**
     * Stuff to remove before we start looking for tags
     */
    var $_removed_re =
        "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Starts with the tag name at a word boundary, where the tag name
     * is not a namespace
     */
    var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

    /**
     * Matches one name=value attribute; the value may be double-quoted,
     * single-quoted or unquoted.
     */
    var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /**
     * Delimited pattern matching a <link ...> tag; set by the constructor.
     */
    var $_link_find = null;

    /**
     * Map of entity name => replacement character; set by the constructor.
     */
    var $_entity_replacements = array();

    /**
     * Undelimited pattern fragment matching any supported entity; set by
     * the constructor.
     */
    var $_ent_replace = null;

    /**
     * Build the delimited patterns from the raw fragments declared above.
     *
     * Defined as __construct() because PHP 8 no longer treats a method
     * named after the class as a constructor.
     */
    function __construct()
    {
        // The fragments are wrapped in place, so running twice would
        // double-wrap them; make a second invocation a no-op.
        if ($this->_link_find !== null) {
            return;
        }

        $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_entity_replacements = array(
            'amp' => '&',
            'lt' => '<',
            'gt' => '>',
            'quot' => '"'
        );

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        // Alternation over the entity NAMES (amp|lt|gt|quot), not over
        // the characters they stand for.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * Old-style constructor, kept so that PHP 4 (which only knows this
     * form) and code calling it explicitly keep working.
     */
    function Auth_OpenID_Parse()
    {
        $this->__construct();
    }

    /**
     * Returns a regular expression that will match a given tag in an
     * SGML string.
     *
     * Capture group 1 holds the text between the tag name and the end of
     * the opening tag; group 2 holds the tag content when the tag is not
     * self-closed. The content ends at the first opening or closing tag
     * named $tag_name (or one of $close_tags), or at the end of input.
     *
     * @param string $tag_name Tag name, inserted into the pattern as-is
     * @param array $close_tags Optional additional tag names that end
     * the content
     * @return string A delimited PCRE pattern
     */
    function tagMatcher($tag_name, $close_tags = null)
    {
        if ($close_tags) {
            $options = implode("|", array_merge(array($tag_name), $close_tags));
            $closer = sprintf("(?:%s)", $options);
        } else {
            $closer = $tag_name;
        }

        $expr = sprintf($this->_tag_expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * @return string Pattern matching the <html> element
     */
    function htmlFind()
    {
        return $this->tagMatcher('html');
    }

    /**
     * @return string Pattern matching the <head> element, whose content
     * is also ended by a <body> tag
     */
    function headFind()
    {
        return $this->tagMatcher('head', array('body'));
    }

    /**
     * Replace the supported entities (&amp; &lt; &gt; &quot;) in a string.
     *
     * The replacement is done in a single pass so that text produced by
     * one replacement is never decoded again ("&amp;lt;" yields "&lt;",
     * not "<").
     *
     * @param string $str Text possibly containing entities
     * @return string The text with the supported entities replaced
     */
    function replaceEntities($str)
    {
        $pairs = array();
        foreach ($this->_entity_replacements as $name => $char) {
            $pairs['&' . $name . ';'] = $char;
        }

        if (!$pairs) {
            return $str;
        }

        return strtr($str, $pairs);
    }

    /**
     * Strip one pair of matching surrounding quotes, if present.
     *
     * @param string $str Raw attribute value
     * @return string The value without its enclosing quotes
     */
    function removeQuotes($str)
    {
        $matches = array();
        // The "s" modifier lets "." span newlines, so quoted values that
        // contain a line break are unquoted as well.
        $double = '/^"(.*)"$/s';
        $single = '/^\'(.*)\'$/s';

        if (preg_match($double, $str, $matches)) {
            return $matches[1];
        } else if (preg_match($single, $str, $matches)) {
            return $matches[1];
        } else {
            return $str;
        }
    }

    /**
     * Find all link tags in a string representing a HTML document and
     * return a list of their attributes.
     *
     * @param string $html The text to parse
     * @return array $list An array of arrays of attributes, one for each
     * link tag. Attribute names are lower-cased; when a name is repeated
     * the last value wins. Empty when no <html>, <head> or <link> is
     * found.
     */
    function parseLinkAttrs($html)
    {
        $stripped = preg_replace($this->_removed_re,
                                 "",
                                 $html);

        // preg_replace() yields null when PCRE fails; there is nothing
        // to parse in that case.
        if (!is_string($stripped)) {
            return array();
        }

        // Try to find the <HTML> tag.
        $html_re = $this->htmlFind();
        $html_matches = array();
        if (!preg_match($html_re, $stripped, $html_matches)) {
            return array();
        }

        // Try to find the <HEAD> tag.
        $head_re = $this->headFind();
        $head_matches = array();
        if (!preg_match($head_re, $html_matches[0], $head_matches)) {
            return array();
        }

        $link_data = array();
        $link_matches = array();

        if (!preg_match_all($this->_link_find, $head_matches[0],
                            $link_matches)) {
            return array();
        }

        foreach ($link_matches[0] as $link) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $link, $attr_matches);
            $link_attrs = array();
            foreach ($attr_matches[0] as $index => $full_match) {
                $name = $attr_matches[1][$index];
                $value = $this->replaceEntities(
                    $this->removeQuotes($attr_matches[2][$index]));

                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Does $target_rel appear among the whitespace-separated values of a
     * rel attribute?
     *
     * Each value is lower-cased before comparison; $target_rel is
     * compared as given.
     *
     * @param string $rel_attr The raw rel attribute value
     * @param string $target_rel The relationship to look for
     * @return int 1 when found, 0 otherwise
     */
    function relMatches($rel_attr, $target_rel)
    {
        // XXX: TESTME
        $rels = preg_split("/\s+/", trim($rel_attr));
        foreach ($rels as $rel) {
            $rel = strtolower($rel);
            // Strict comparison: "==" would treat numeric-looking
            // strings such as "1e1" and "10" as equal.
            if ($rel === $target_rel) {
                return 1;
            }
        }

        return 0;
    }

    /**
     * Does this link have target_rel as a relationship?
     *
     * @param array $link_attrs Attributes of one link tag
     * @param string $target_rel The relationship to look for
     * @return bool
     */
    function linkHasRel($link_attrs, $target_rel)
    {
        // XXX: TESTME
        $rel_attr = Auth_OpenID::arrayGet($link_attrs, 'rel', null);
        return ($rel_attr && $this->relMatches($rel_attr,
                                               $target_rel));
    }

    /**
     * Filter the list of link attributes on whether it has target_rel
     * as a relationship.
     *
     * @param array $link_attrs_list List of attribute arrays
     * @param string $target_rel The relationship to look for
     * @return array The matching attribute arrays, in their original order
     */
    function findLinksRel($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $result = array();
        foreach ($link_attrs_list as $attr) {
            if ($this->linkHasRel($attr, $target_rel)) {
                $result[] = $attr;
            }
        }

        return $result;
    }

    /**
     * Return the value of the href attribute for the first link tag in
     * the list that has target_rel as a relationship.
     *
     * @param array $link_attrs_list List of attribute arrays
     * @param string $target_rel The relationship to look for
     * @return mixed null when no link matches; otherwise the result of
     * Auth_OpenID::arrayGet() for 'href' with a null default
     */
    function findFirstHref($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $matches = $this->findLinksRel($link_attrs_list,
                                       $target_rel);
        if (!$matches) {
            return null;
        }
        $first = $matches[0];
        return Auth_OpenID::arrayGet($first, 'href', null);
    }
}
|
/**
 * Look up the OpenID server (and delegate) advertised by the <link>
 * tags of an HTML document.
 *
 * @param string $html_text The document text to inspect
 * @return mixed false when findFirstHref() yields null for
 * 'openid.server'; otherwise array($delegate_url, $server_url), where
 * $delegate_url is whatever findFirstHref() yields for
 * 'openid.delegate'
 */
function Auth_OpenID_legacy_discover($html_text)
{
    $parser = new Auth_OpenID_Parse();
    $links = $parser->parseLinkAttrs($html_text);

    $server = $parser->findFirstHref($links, 'openid.server');

    // The delegate is only looked up once a server has been found.
    if ($server !== null) {
        $delegate = $parser->findFirstHref($links, 'openid.delegate');
        return array($delegate, $server);
    }

    return false;
}
|
?> |