770 |
florian |
1 |
<?php
|
|
|
2 |
/*
|
|
|
3 |
* Project: MagpieRSS: a simple RSS integration tool
|
|
|
4 |
* File: rss_fetch.inc, a simple functional interface
|
|
|
5 |
to fetching and parsing RSS files, via the
|
|
|
6 |
function fetch_rss()
|
|
|
7 |
* Author: Kellan Elliott-McCrea <kellan@protest.net>
|
|
|
8 |
* License: GPL
|
|
|
9 |
*
|
|
|
10 |
* The latest version of MagpieRSS can be obtained from:
|
|
|
11 |
* http://magpierss.sourceforge.net
|
|
|
12 |
*
|
|
|
13 |
* For questions, help, comments, discussion, etc., please join the
|
|
|
14 |
* Magpie mailing list:
|
|
|
15 |
* magpierss-general@lists.sourceforge.net
|
|
|
16 |
*
|
|
|
17 |
*/
|
|
|
18 |
|
|
|
19 |
// Setup MAGPIE_DIR for use on hosts that don't include
|
|
|
20 |
// the current path in include_path.
|
|
|
21 |
// with thanks to rajiv and smarty
|
|
|
22 |
// Shorthand for the platform directory separator. An including script may
// define it first; in that case its value is kept.
if (!defined('DIR_SEP')) {
    define('DIR_SEP', DIRECTORY_SEPARATOR);
}

// Directory containing this file, with a trailing separator, so the
// requires below do not depend on include_path.
if (!defined('MAGPIE_DIR')) {
    define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
}

require_once( MAGPIE_DIR . 'rss_parse.inc' );
require_once( MAGPIE_DIR . 'rss_cache.inc' );

// for including 3rd party libraries
// Guarded like the constants above so a value supplied by the including
// script is respected instead of raising a "constant already defined" error.
if (!defined('MAGPIE_EXTLIB')) {
    define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
}
require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
|
|
|
36 |
|
|
|
37 |
|
|
|
38 |
/*
|
|
|
39 |
* CONSTANTS - redefine these in your script to change the
|
|
|
40 |
* behaviour of fetch_rss(). Currently, most options affect the cache
|
|
|
41 |
*
|
|
|
42 |
* MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
|
|
|
43 |
* For me a built in cache was essential to creating a "PHP-like"
|
|
|
44 |
* feel to Magpie, see rss_cache.inc for rationale
|
|
|
45 |
*
|
|
|
46 |
*
|
|
|
47 |
* MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
|
|
|
48 |
* This should be a location that the webserver can write to. If this
|
|
|
49 |
* directory does not already exist Magpie will try to be smart and create
|
|
|
50 |
* it. This will often fail for permissions reasons.
|
|
|
51 |
*
|
|
|
52 |
*
|
|
|
53 |
* MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
|
|
|
54 |
*
|
|
|
55 |
*
|
|
|
56 |
* MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
|
|
|
57 |
* instead of returning stale object?
|
|
|
58 |
*
|
|
|
59 |
* MAGPIE_DEBUG - Display debugging notices?
|
|
|
60 |
*
|
|
|
61 |
*/
|
|
|
62 |
|
|
|
63 |
|
|
|
64 |
/*=======================================================================*\
|
|
|
65 |
Function: fetch_rss:
|
|
|
66 |
Purpose: return RSS object for the given url
|
|
|
67 |
maintain the cache
|
|
|
68 |
Input: url of RSS file
|
|
|
69 |
Output: parsed RSS object (see rss_parse.inc)
|
|
|
70 |
|
|
|
71 |
NOTES ON CACHEING:
|
|
|
72 |
If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache.
|
|
|
73 |
|
|
|
74 |
NOTES ON RETRIEVING REMOTE FILES:
|
|
|
75 |
If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will
|
|
|
76 |
return a cached object, and touch the cache object upon receiving a
|
|
|
77 |
304.
|
|
|
78 |
|
|
|
79 |
NOTES ON FAILED REQUESTS:
|
|
|
80 |
If there is an HTTP error while fetching an RSS object, the cached
|
|
|
81 |
version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off)
|
|
|
82 |
\*=======================================================================*/
|
|
|
83 |
|
|
|
84 |
// Version string of this MagpieRSS release; init() embeds it in the
// default User-Agent.
define('MAGPIE_VERSION', '0.72');

// Last error message recorded by error(); read it through magpie_error().
// Written through $GLOBALS because the functions below access it with
// "global $MAGPIE_ERROR": a plain assignment would only create a local
// variable if this file were included from inside a function.
$GLOBALS['MAGPIE_ERROR'] = "";
|
|
|
87 |
|
|
|
88 |
/**
 * Fetch the feed at $url and return it as a parsed RSS object.
 *
 * With MAGPIE_CACHE_ON off the feed is always fetched and parsed.
 * With it on, the flow is:
 *   1. check the cache
 *   2. on a fresh hit, return the cached object (from_cache is set to 1)
 *   3. otherwise fetch remotely, sending a conditional GET when the stale
 *      object carries both an ETag and a Last-Modified value
 *   4. if the fetch or the parse fails, return the stale object unless
 *      MAGPIE_CACHE_FRESH_ONLY is on; otherwise report the error
 *
 * @param string $url  url of the RSS file
 * @return mixed       parsed RSS object (see rss_parse.inc), or false on
 *                     failure; the message is available via magpie_error()
 */
function fetch_rss ($url) {
    // initialize constants
    init();

    if ( !isset($url) ) {
        error("fetch_rss called without a url");
        return false;
    }

    // cache is disabled: fetch file, and parse it
    if ( !MAGPIE_CACHE_ON ) {
        $resp = _fetch_remote_file( $url );
        if ( is_success( $resp->status ) ) {
            return _response_to_rss( $resp );
        }

        error("Failed to fetch $url and cache is off");
        return false;
    }

    // else cache is ON
    $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );

    if (MAGPIE_DEBUG and $cache->ERROR) {
        debug($cache->ERROR, E_USER_WARNING);
    }

    // Empty string rather than 0: on PHP < 8, 0 == 'HIT' is true, which made
    // an unusable cache look like a cache hit.
    $cache_status    = '';       // response of check_cache
    $request_headers = array();  // HTTP headers to send with fetch
    $rss             = false;    // object read from the cache, if any
    $errormsg        = '';       // errors, if any

    // store parsed XML by desired output encoding
    // as character munging happens at parse time
    $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

    if (!$cache->ERROR) {
        // return cache HIT, MISS, or STALE
        $cache_status = $cache->check_cache( $cache_key );
    }

    // if object cached, and cache is fresh, return cached obj
    if ( $cache_status === 'HIT' ) {
        $rss = $cache->get( $cache_key );
        if ( isset($rss) and $rss ) {
            // should be cache age
            $rss->from_cache = 1;
            if ( MAGPIE_DEBUG > 1) {
                debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
            }
            return $rss;
        }
    }

    // else attempt a conditional get: setup headers
    if ( $cache_status === 'STALE' ) {
        $rss = $cache->get( $cache_key );
        if ( $rss and $rss->etag and $rss->last_modified ) {
            $request_headers['If-None-Match'] = $rss->etag;
            // The request header that pairs with Last-Modified is
            // If-Modified-Since; "If-Last-Modified" is not an HTTP header.
            $request_headers['If-Modified-Since'] = $rss->last_modified;
        }
    }

    $resp = _fetch_remote_file( $url, $request_headers );

    if (isset($resp) and $resp) {
        if ($resp->status == '304' ) {
            // we have the most current copy
            if ( MAGPIE_DEBUG > 1) {
                debug("Got 304 for $url");
            }
            // reset cache on 304 (at minutillo insistent prodding)
            $cache->set($cache_key, $rss);
            return $rss;
        }
        elseif ( is_success( $resp->status ) ) {
            // Parse into a separate variable so that a parse failure does
            // not discard the stale cached copy held in $rss.
            $fresh = _response_to_rss( $resp );
            if ( $fresh ) {
                if (MAGPIE_DEBUG > 1) {
                    debug("Fetch successful");
                }
                // add object to cache
                $cache->set( $cache_key, $fresh );
                return $fresh;
            }
            // _response_to_rss() has already reported the parse error, so
            // $errormsg stays empty and error() below is a no-op.
        }
        else {
            $errormsg = "Failed to fetch $url ";
            if ( $resp->status == '-100' ) {
                $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
            }
            elseif ( $resp->error ) {
                # compensate for Snoopy's annoying habbit to tacking
                # on '\n'
                $http_error = substr($resp->error, 0, -2);
                $errormsg .= "(HTTP Error: $http_error)";
            }
            else {
                $errormsg .= "(HTTP Response: " . $resp->response_code .')';
            }
        }
    }
    else {
        $errormsg = "Unable to retrieve RSS file for unknown reasons.";
    }

    // fetch (or parse) failed: attempt to return the cached object, unless
    // the caller asked for fresh data only
    if ( $rss and !MAGPIE_CACHE_FRESH_ONLY ) {
        if ( MAGPIE_DEBUG ) {
            debug("Returning STALE object for $url");
        }
        return $rss;
    }

    // else we totally failed
    error( $errormsg );

    return false;
} // end fetch_rss()
|
|
|
222 |
|
|
|
223 |
/*=======================================================================*\
|
|
|
224 |
Function: error
|
|
|
225 |
Purpose: set MAGPIE_ERROR, and trigger error
|
|
|
226 |
\*=======================================================================*/
|
|
|
227 |
|
|
|
228 |
/**
 * Record an error message and raise it through PHP's error mechanism.
 *
 * The message is prefixed with "MagpieRSS: ", stored in the global
 * $MAGPIE_ERROR and passed to trigger_error(). An empty (falsy) message
 * is ignored and leaves $MAGPIE_ERROR untouched.
 *
 * The former isset($php_errormsg) check was removed: that variable is only
 * populated in the scope where a PHP error occurred, so it was never set
 * inside this function, and the feature no longer exists in PHP 8.
 *
 * @param string $errormsg  description of the failure
 * @param int    $lvl       E_USER_* level handed to trigger_error()
 */
function error ($errormsg, $lvl=E_USER_WARNING) {
    global $MAGPIE_ERROR;

    // nothing to report
    if ( !$errormsg ) {
        return;
    }

    $errormsg = "MagpieRSS: $errormsg";
    $MAGPIE_ERROR = $errormsg;
    trigger_error( $errormsg, $lvl);
}
|
|
|
241 |
|
|
|
242 |
/**
 * Emit a debugging message through trigger_error().
 *
 * @param string $debugmsg  text to report; "MagpieRSS [debug] " is prepended
 * @param int    $lvl       E_USER_* level, E_USER_NOTICE by default
 */
function debug ($debugmsg, $lvl=E_USER_NOTICE) {
    $message = 'MagpieRSS [debug] ' . $debugmsg;
    trigger_error($message, $lvl);
}
|
|
|
245 |
|
|
|
246 |
/*=======================================================================*\
|
|
|
247 |
Function: magpie_error
|
|
|
248 |
Purpose: accessor for the magpie error variable
|
|
|
249 |
\*=======================================================================*/
|
|
|
250 |
/**
 * Read, and optionally overwrite, the global MagpieRSS error message.
 *
 * @param string $errormsg  when non-empty, replaces the stored message
 * @return mixed            the current value of $MAGPIE_ERROR
 */
function magpie_error ($errormsg="") {
    global $MAGPIE_ERROR;

    // only a non-empty argument acts as a setter
    if ( !empty($errormsg) ) {
        $MAGPIE_ERROR = $errormsg;
    }

    return $MAGPIE_ERROR;
}
|
|
|
259 |
|
|
|
260 |
/*=======================================================================*\
|
|
|
261 |
Function: _fetch_remote_file
|
|
|
262 |
Purpose: retrieve an arbitrary remote file
|
|
|
263 |
Input: url of the remote file
|
|
|
264 |
headers to send along with the request (optional)
|
|
|
265 |
Output: an HTTP response object (see Snoopy.class.inc)
|
|
|
266 |
\*=======================================================================*/
|
|
|
267 |
/**
 * Retrieve an arbitrary remote file with Snoopy.
 *
 * @param string $url      url of the remote file
 * @param mixed  $headers  array of request headers to send; any non-array
 *                         value (the default "") sends no extra headers
 * @return Snoopy          the client object after the fetch; callers read
 *                         the response from its properties
 */
function _fetch_remote_file ($url, $headers = "" ) {
    // Snoopy is an HTTP client in PHP
    $http = new Snoopy();

    if ( is_array($headers) ) {
        $http->rawheaders = $headers;
    }
    $http->use_gzip     = MAGPIE_USE_GZIP;
    $http->read_timeout = MAGPIE_FETCH_TIME_OUT;
    $http->agent        = MAGPIE_USER_AGENT;

    // PHP diagnostics from the fetch are suppressed; the outcome is
    // inspected by the caller on the returned object
    @$http->fetch($url);

    return $http;
}
|
|
|
281 |
|
|
|
282 |
/*=======================================================================*\
|
|
|
283 |
Function: _response_to_rss
|
|
|
284 |
Purpose: parse an HTTP response object into an RSS object
|
|
|
285 |
Input: an HTTP response object (see Snoopy)
|
|
|
286 |
Output: parsed RSS object (see rss_parse)
|
|
|
287 |
\*=======================================================================*/
|
|
|
288 |
/**
 * Parse an HTTP response object into an RSS object.
 *
 * On success the ETag and Last-Modified response headers, when present,
 * are copied onto the RSS object as ->etag and ->last_modified so that
 * fetch_rss() can issue a conditional GET next time.
 *
 * Header names are matched case-insensitively (HTTP field names are not
 * case-sensitive) and names and values are trimmed, so a line terminator
 * left on a raw header line cannot leak into a later request header.
 *
 * @param object $resp  HTTP response object (see Snoopy); ->results and
 *                      ->headers are read
 * @return mixed        parsed RSS object (see rss_parse), or false after
 *                      reporting the parse error through error()
 */
function _response_to_rss ($resp) {
    $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

    // parse failed: construct error message
    if ( !$rss or $rss->ERROR ) {
        $errormsg = "Failed to parse RSS file.";

        if ($rss) {
            $errormsg .= " (" . $rss->ERROR . ")";
        }
        error($errormsg);

        return false;
    }

    // find Etag, and Last-Modified
    foreach ( (array) $resp->headers as $h ) {
        // lines without a colon (e.g. the status line) carry no field
        $parts = explode(':', $h, 2);
        if ( count($parts) < 2 ) {
            continue;
        }

        $field = strtolower(trim($parts[0]));
        $val   = trim($parts[1]);

        if ( $field === 'etag' ) {
            $rss->etag = $val;
        }
        elseif ( $field === 'last-modified' ) {
            $rss->last_modified = $val;
        }
    }

    return $rss;
}
|
|
|
327 |
|
|
|
328 |
/*=======================================================================*\
|
|
|
329 |
Function: init
|
|
|
330 |
Purpose: setup constants with default values
|
|
|
331 |
check for user overrides
|
|
|
332 |
\*=======================================================================*/
|
|
|
333 |
/**
 * Define every MAGPIE_* configuration constant that the including script
 * has not already defined, using the defaults listed below.
 *
 * Runs once per request: the MAGPIE_INITALIZED constant (spelling kept,
 * it is part of the public surface) marks that the defaults are in place.
 */
function init () {
    if ( defined('MAGPIE_INITALIZED') ) {
        return;
    }
    define('MAGPIE_INITALIZED', true);

    $defaults = array(
        'MAGPIE_CACHE_ON'         => false,
        'MAGPIE_CACHE_DIR'        => './cache',
        'MAGPIE_CACHE_AGE'        => 60*60,        // one hour
        'MAGPIE_CACHE_FRESH_ONLY' => false,
        'MAGPIE_OUTPUT_ENCODING'  => 'ISO-8859-1',
        'MAGPIE_INPUT_ENCODING'   => null,
        'MAGPIE_DETECT_ENCODING'  => true,
        'MAGPIE_DEBUG'            => 0,
        'MAGPIE_FETCH_TIME_OUT'   => 5,            // 5 second timeout
        'MAGPIE_USE_GZIP'         => true,         // use gzip encoding to fetch rss files if supported?
    );

    foreach ( $defaults as $constant => $value ) {
        if ( !defined($constant) ) {
            define($constant, $value);
        }
    }

    // The default user agent advertises whether caching is in use, so it is
    // built after MAGPIE_CACHE_ON has been settled above.
    if ( !defined('MAGPIE_USER_AGENT') ) {
        $tail = MAGPIE_CACHE_ON ? ')' : '; No cache)';
        define('MAGPIE_USER_AGENT', 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $tail);
    }
}
|
|
|
395 |
|
|
|
396 |
// NOTE: the following code should really be in Snoopy, or at least
|
|
|
397 |
// somewhere other than rss_fetch!
|
|
|
398 |
|
|
|
399 |
/*=======================================================================*\
|
|
|
400 |
HTTP STATUS CODE PREDICATES
|
|
|
401 |
These functions attempt to classify an HTTP status code
|
|
|
402 |
based on RFC 2616 and RFC 2518.
|
|
|
403 |
|
|
|
404 |
All of them take an HTTP status code as input, and return true or false
|
|
|
405 |
|
|
|
406 |
All this code is adapted from LWP's HTTP::Status.
|
|
|
407 |
\*=======================================================================*/
|
|
|
408 |
|
|
|
409 |
|
|
|
410 |
/*=======================================================================*\
|
|
|
411 |
Function: is_info
|
|
|
412 |
Purpose: return true if Informational status code
|
|
|
413 |
\*=======================================================================*/
|
|
|
414 |
/**
 * Informational status code predicate.
 *
 * @param mixed $sc  HTTP status code (int or numeric string)
 * @return bool      true when 100 <= $sc < 200
 */
function is_info ($sc) {
    if ( !($sc >= 100) ) {
        return false;
    }
    return $sc < 200;
}
|
|
|
417 |
|
|
|
418 |
/*=======================================================================*\
|
|
|
419 |
Function: is_success
|
|
|
420 |
Purpose: return true if Successful status code
|
|
|
421 |
\*=======================================================================*/
|
|
|
422 |
/**
 * Successful status code predicate.
 *
 * @param mixed $sc  HTTP status code (int or numeric string)
 * @return bool      true when 200 <= $sc < 300
 */
function is_success ($sc) {
    $reached_2xx = ($sc >= 200);
    $below_3xx   = ($sc < 300);

    return $reached_2xx && $below_3xx;
}
|
|
|
425 |
|
|
|
426 |
/*=======================================================================*\
|
|
|
427 |
Function: is_redirect
|
|
|
428 |
Purpose: return true if Redirection status code
|
|
|
429 |
\*=======================================================================*/
|
|
|
430 |
/**
 * Redirection status code predicate.
 *
 * @param mixed $sc  HTTP status code (int or numeric string)
 * @return bool      true when 300 <= $sc < 400
 */
function is_redirect ($sc) {
    if ( !($sc >= 300) ) {
        return false;
    }
    return $sc < 400;
}
|
|
|
433 |
|
|
|
434 |
/*=======================================================================*\
|
|
|
435 |
Function: is_error
|
|
|
436 |
Purpose: return true if Error status code
|
|
|
437 |
\*=======================================================================*/
|
|
|
438 |
/**
 * Error status code predicate (client or server error).
 *
 * @param mixed $sc  HTTP status code (int or numeric string)
 * @return bool      true when 400 <= $sc < 600
 */
function is_error ($sc) {
    $reached_4xx = ($sc >= 400);
    $below_6xx   = ($sc < 600);

    return $reached_4xx && $below_6xx;
}
|
|
|
441 |
|
|
|
442 |
/*=======================================================================*\
|
|
|
443 |
Function: is_client_error
|
|
|
444 |
Purpose: return true if Error status code, and its a client error
|
|
|
445 |
\*=======================================================================*/
|
|
|
446 |
/**
 * Client error status code predicate.
 *
 * @param mixed $sc  HTTP status code (int or numeric string)
 * @return bool      true when 400 <= $sc < 500
 */
function is_client_error ($sc) {
    if ( !($sc >= 400) ) {
        return false;
    }
    return $sc < 500;
}
|
|
|
449 |
|
|
|
450 |
/*=======================================================================*\
|
|
|
451 |
Function: is_server_error
|
|
|
452 |
Purpose: return true if Error status code, and its a server error
|
|
|
453 |
\*=======================================================================*/
|
|
|
454 |
/**
 * Server error status code predicate.
 *
 * @param mixed $sc  HTTP status code (int or numeric string)
 * @return bool      true when 500 <= $sc < 600
 */
function is_server_error ($sc) {
    $reached_5xx = ($sc >= 500);
    $below_6xx   = ($sc < 600);

    return $reached_5xx && $below_6xx;
}
|
|
|
457 |
|
|
|
458 |
?>
|