Blame | Last modification | View Log | RSS feed
<?php

/**
 * This module implements a VERY limited parser that finds <link> tags
 * in the head of HTML or XHTML documents and parses out their
 * attributes according to the OpenID spec. It is a liberal parser,
 * but it requires these things from the data in order to work:
 *
 * - There must be an open <html> tag
 *
 * - There must be an open <head> tag inside of the <html> tag
 *
 * - Only <link>s that are found inside of the <head> tag are parsed
 *   (this is by design)
 *
 * - The parser follows the OpenID specification in resolving the
 *   attributes of the link tags. This means that the attributes DO
 *   NOT get resolved as they would by an XML or HTML parser. In
 *   particular, only certain entities get replaced, and href
 *   attributes do not get resolved relative to a base URL.
 *
 * From http://openid.net/specs.bml:
 *
 * - The openid.server URL MUST be an absolute URL. OpenID consumers
 *   MUST NOT attempt to resolve relative URLs.
 *
 * - The openid.server URL MUST NOT include entities other than &amp;,
 *   &lt;, &gt;, and &quot;.
 *
 * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
 * of quoting are allowed for attributes.
 *
 * The parser deals with invalid markup in these ways:
 *
 * - Tag names are not case-sensitive
 *
 * - The <html> tag is accepted even when it is not at the top level
 *
 * - The <head> tag is accepted even when it is not a direct child of
 *   the <html> tag, but a <html> tag must be an ancestor of the
 *   <head> tag
 *
 * - <link> tags are accepted even when they are not direct children
 *   of the <head> tag, but a <head> tag must be an ancestor of the
 *   <link> tag
 *
 * - If there is no closing tag for an open <html> or <head> tag, the
 *   remainder of the document is viewed as being inside of the
 *   tag. If there is no closing tag for a <link> tag, the link tag is
 *   treated as a short tag. Exceptions to this rule are that <html>
 *   closes <html> and <body> or <head> closes <head>
 *
 * - Attributes of the <link> tag are not required to be quoted.
 *
 * - In the case of duplicated attribute names, the attribute coming
 *   last in the tag will be the value returned.
 *
 * - Any text that does not parse as an attribute within a link tag
 *   will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
 *   ignore pumpkin)
 *
 * - If there are more than one <html> or <head> tag, the parser only
 *   looks inside of the first one.
 *
 * - The contents of <script> tags are ignored entirely, except
 *   unclosed <script> tags. Unclosed <script> tags are ignored.
 *
 * - Any other invalid markup is ignored, including unclosed SGML
 *   comments and unclosed <![CDATA[blocks.
 *
 * PHP versions 4 and 5
 *
 * LICENSE: See the COPYING file included in this distribution.
 *
 * @access private
 * @package OpenID
 * @author JanRain, Inc. <openid@janrain.com>
 * @copyright 2005 Janrain, Inc.
 * @license http://www.gnu.org/copyleft/lesser.html LGPL
 */

/**
 * Require Auth_OpenID::arrayGet().
 */
require_once "Auth/OpenID.php";

class Auth_OpenID_Parse {

    /**
     * Specify some flags for use with regex matching.
     */
    var $_re_flags = "si";

    /**
     * Stuff to remove before we start looking for tags. Holds the bare
     * pattern until the constructor wraps it in delimiters and flags.
     */
    var $_removed_re =
        "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Starts with the tag name at a word boundary, where the tag name
     * is not a namespace. The two %s placeholders are the tag name and
     * the alternation of tag names that close it (see tagMatcher()).
     */
    var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

    /**
     * Matches one name=value attribute; the value may be double-quoted,
     * single-quoted or unquoted. Holds the bare pattern until the
     * constructor wraps it in delimiters and flags.
     */
    var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /**
     * Delimited regex matching a whole <link ...> tag; set by the
     * constructor.
     */
    var $_link_find = null;

    /**
     * Map of entity name => replacement character; set by the
     * constructor.
     */
    var $_entity_replacements = array();

    /**
     * Undelimited pattern matching any supported entity; set by the
     * constructor.
     */
    var $_ent_replace = null;

    /**
     * Guards against wrapping the patterns in delimiters twice if the
     * initialisation is invoked more than once on the same object.
     */
    var $_initialized = false;

    /**
     * Build the delimited regular expressions and the entity table.
     *
     * Declared as __construct() so that "new Auth_OpenID_Parse()"
     * still initialises the object on PHP 8, where a method named
     * after the class is no longer treated as a constructor.
     */
    function __construct()
    {
        if ($this->_initialized) {
            return;
        }
        $this->_initialized = true;

        $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_entity_replacements = array(
                                            'amp' => '&',
                                            'lt' => '<',
                                            'gt' => '>',
                                            'quot' => '"'
                                            );

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        // Alternation of the entity NAMES (amp|lt|gt|quot), not of the
        // characters they decode to.
        $this->_ent_replace =
            sprintf("&(%s);",
                    implode("|", array_keys($this->_entity_replacements)));
    }

    /**
     * PHP 4-style constructor, kept so that PHP 4 and any code calling
     * this method by name keep working. Delegates to __construct().
     */
    function Auth_OpenID_Parse()
    {
        $this->__construct();
    }

    /**
     * Returns a regular expression that will match a given tag in an
     * SGML string.
     *
     * @param string $tag_name The tag to match
     * @param array $close_tags Optional names of other tags whose
     * open or close tag also terminates $tag_name
     * @return string A delimited regular expression including flags
     */
    function tagMatcher($tag_name, $close_tags = null)
    {
        if ($close_tags) {
            $options = implode("|", array_merge(array($tag_name),
                                                $close_tags));
            $closer = sprintf("(?:%s)", $options);
        } else {
            $closer = $tag_name;
        }

        $expr = sprintf($this->_tag_expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns the regular expression matching the <html> element.
     *
     * @return string
     */
    function htmlFind()
    {
        return $this->tagMatcher('html');
    }

    /**
     * Returns the regular expression matching the <head> element,
     * which is also terminated by a <body> tag.
     *
     * @return string
     */
    function headFind()
    {
        return $this->tagMatcher('head', array('body'));
    }

    /**
     * Replace the supported entities (the keys of
     * $this->_entity_replacements) with their characters.
     *
     * The replacement is done in a single pass so that already-decoded
     * output is never decoded again: "&amp;lt;" becomes "&lt;", not
     * "<".
     *
     * @param string $str The text to decode
     * @return string The decoded text
     */
    function replaceEntities($str)
    {
        $pairs = array();
        foreach ($this->_entity_replacements as $name => $char) {
            $pairs['&' . $name . ';'] = $char;
        }

        // strtr() with an array never re-scans text it has replaced.
        return strtr($str, $pairs);
    }

    /**
     * Strip one pair of matching surrounding quotes (double or single)
     * from a string; any other string is returned unchanged.
     *
     * @param string $str The possibly quoted value
     * @return string The value without its surrounding quotes
     */
    function removeQuotes($str)
    {
        $matches = array();
        // The "s" flag lets "." match newlines, so quoted values that
        // span several lines are unquoted as well.
        $double = '/^"(.*)"$/s';
        $single = "/^'(.*)'$/s";

        if (preg_match($double, $str, $matches)) {
            return $matches[1];
        } else if (preg_match($single, $str, $matches)) {
            return $matches[1];
        } else {
            return $str;
        }
    }

    /**
     * Find all link tags in a string representing a HTML document and
     * return a list of their attributes.
     *
     * Attribute names are lowercased; values are unquoted and have the
     * supported entities replaced. When an attribute is repeated in a
     * tag, the last occurrence wins.
     *
     * @param string $html The text to parse
     * @return array $list An array of arrays of attributes, one for each
     * link tag
     */
    function parseLinkAttrs($html)
    {
        $stripped = preg_replace($this->_removed_re,
                                 "",
                                 $html);

        // preg_replace() returns null on a PCRE error (for example an
        // exceeded backtrack limit); treat that as "nothing found".
        if ($stripped === null) {
            return array();
        }

        // Try to find the <HTML> tag.
        $html_re = $this->htmlFind();
        $html_matches = array();
        if (!preg_match($html_re, $stripped, $html_matches)) {
            return array();
        }

        // Try to find the <HEAD> tag.
        $head_re = $this->headFind();
        $head_matches = array();
        if (!preg_match($head_re, $html_matches[0], $head_matches)) {
            return array();
        }

        $link_matches = array();
        if (!preg_match_all($this->_link_find, $head_matches[0],
                            $link_matches)) {
            return array();
        }

        $link_data = array();
        foreach ($link_matches[0] as $link) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $link, $attr_matches);

            $link_attrs = array();
            // Group 1 is the attribute name, group 2 its raw value.
            foreach ($attr_matches[1] as $index => $name) {
                $raw_value = $attr_matches[2][$index];
                $value = $this->replaceEntities(
                    $this->removeQuotes($raw_value));

                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Does $target_rel appear among the whitespace-separated values of
     * $rel_attr? The comparison ignores ASCII case on both sides.
     *
     * XXX: TESTME
     *
     * @param string $rel_attr The value of a rel attribute
     * @param string $target_rel The relationship to look for
     * @return int 1 if the relationship is present, 0 otherwise
     */
    function relMatches($rel_attr, $target_rel)
    {
        $target = strtolower($target_rel);
        $rels = preg_split("/\s+/", trim($rel_attr));

        foreach ($rels as $rel) {
            // Strict comparison: both sides are strings, and loose
            // comparison would treat numeric-looking strings as numbers.
            if (strtolower($rel) === $target) {
                return 1;
            }
        }

        return 0;
    }

    /**
     * Does this link have $target_rel as a relationship?
     *
     * XXX: TESTME
     *
     * @param array $link_attrs Attributes of one link tag, as returned
     * by parseLinkAttrs()
     * @param string $target_rel The relationship to look for
     * @return bool
     */
    function linkHasRel($link_attrs, $target_rel)
    {
        // Presumably returns $link_attrs['rel'] or null when absent;
        // arrayGet() lives in Auth/OpenID.php -- verify there.
        $rel_attr = Auth_OpenID::arrayGet($link_attrs, 'rel', null);
        return ($rel_attr && $this->relMatches($rel_attr,
                                               $target_rel));
    }

    /**
     * Filter the list of link attributes on whether it has
     * $target_rel as a relationship.
     *
     * XXX: TESTME
     *
     * @param array $link_attrs_list List of attribute arrays
     * @param string $target_rel The relationship to look for
     * @return array The matching attribute arrays, in original order
     */
    function findLinksRel($link_attrs_list, $target_rel)
    {
        $result = array();
        foreach ($link_attrs_list as $attr) {
            if ($this->linkHasRel($attr, $target_rel)) {
                $result[] = $attr;
            }
        }

        return $result;
    }

    /**
     * Return the value of the href attribute for the first link tag
     * in the list that has $target_rel as a relationship.
     *
     * XXX: TESTME
     *
     * @param array $link_attrs_list List of attribute arrays
     * @param string $target_rel The relationship to look for
     * @return mixed The result of Auth_OpenID::arrayGet() for 'href' on
     * the first matching link, or null when no link matches
     */
    function findFirstHref($link_attrs_list, $target_rel)
    {
        $matches = $this->findLinksRel($link_attrs_list,
                                       $target_rel);
        if (!$matches) {
            return null;
        }
        $first = $matches[0];
        return Auth_OpenID::arrayGet($first, 'href', null);
    }
}

/**
 * Look for the link tags with the "openid.server" and
 * "openid.delegate" relationships in an HTML document.
 *
 * @param string $html_text The HTML document to search
 * @return mixed false when no openid.server link is found; otherwise
 * array($delegate_url, $server_url), where $delegate_url is null if
 * there is no openid.delegate link
 */
function Auth_OpenID_legacy_discover($html_text)
{
    $p = new Auth_OpenID_Parse();

    $link_attrs = $p->parseLinkAttrs($html_text);

    $server_url = $p->findFirstHref($link_attrs,
                                    'openid.server');

    if ($server_url === null) {
        return false;
    } else {
        $delegate_url = $p->findFirstHref($link_attrs,
                                          'openid.delegate');
        return array($delegate_url, $server_url);
    }
}

?>