Subversion Repositories Applications.papyrus

Rev

Rev 770 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
770 florian 1
<?php
2
/*
3
 * Project:     MagpieRSS: a simple RSS integration tool
4
 * File:        rss_fetch.inc, a simple functional interface
5
                to fetching and parsing RSS files, via the
6
                function fetch_rss()
7
 * Author:      Kellan Elliott-McCrea <kellan@protest.net>
8
 * License:     GPL
9
 *
10
 * The latest version of MagpieRSS can be obtained from:
11
 * http://magpierss.sourceforge.net
12
 *
13
 * For questions, help, comments, discussion, etc., please join the
14
 * Magpie mailing list:
15
 * magpierss-general@lists.sourceforge.net
16
 *
17
 */
18
 
19
// Setup MAGPIE_DIR for use on hosts that don't include
20
// the current path in include_path.
21
// with thanks to rajiv and smarty
22
// Short alias for the platform directory separator; callers may predefine it.
if (!defined('DIR_SEP')) {
    define('DIR_SEP', DIRECTORY_SEPARATOR);
}

// Directory containing this file (with trailing separator), unless the
// including script has already chosen a different location.
if (!defined('MAGPIE_DIR')) {
    define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
}

require_once( MAGPIE_DIR . 'rss_parse.inc' );
require_once( MAGPIE_DIR . 'rss_cache.inc' );

// for including 3rd party libraries
// Guarded like DIR_SEP and MAGPIE_DIR so that a script may point Magpie at
// another extlib directory, and so that a pre-existing definition does not
// raise a "constant already defined" notice.
if (!defined('MAGPIE_EXTLIB')) {
    define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
}
require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
36
 
37
 
38
/*
39
 * CONSTANTS - redefine these in your script to change the
40
 * behaviour of fetch_rss(). Currently, most options affect the cache
41
 *
42
 * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
43
 * For me a built in cache was essential to creating a "PHP-like"
44
 * feel to Magpie, see rss_cache.inc for rationale
45
 *
46
 *
47
 * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
48
 * This should be a location that the webserver can write to.   If this
49
 * directory does not already exist Magpie will try to be smart and create
50
 * it.  This will often fail for permissions reasons.
51
 *
52
 *
53
 * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
54
 *
55
 *
56
 * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
57
 * instead of returning stale object?
58
 *
59
 * MAGPIE_DEBUG - Display debugging notices?
60
 *
61
*/
62
 
63
 
64
/*=======================================================================*\
65
    Function: fetch_rss:
66
    Purpose:  return RSS object for the given url
67
              maintain the cache
68
    Input:    url of RSS file
69
    Output:   parsed RSS object (see rss_parse.inc)
70
 
71
    NOTES ON CACHING:
72
    If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache.
73
 
74
    NOTES ON RETRIEVING REMOTE FILES:
75
    If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will
76
    return a cached object, and touch the cache object upon receiving a
77
    304.
78
 
79
    NOTES ON FAILED REQUESTS:
80
    If there is an HTTP error while fetching an RSS object, the cached
81
    version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off)
82
\*=======================================================================*/
83
 
84
// Version string; init() embeds it in the default User-Agent.
define('MAGPIE_VERSION', '0.72');

// Last error message recorded by error() and exposed by magpie_error().
// Both of those use the *global* $MAGPIE_ERROR, so initialise it through
// $GLOBALS: a plain assignment would only create a local variable when this
// file is included from inside a function or method.
$GLOBALS['MAGPIE_ERROR'] = "";
87
 
88
/**
 * Fetch and parse the feed at $url, going through the on-disk cache when
 * MAGPIE_CACHE_ON is true.
 *
 * With the cache enabled the flow is:
 *   1. check the cache; a fresh entry (HIT) is returned immediately
 *   2. otherwise fetch the remote file, sending conditional-GET validators
 *      taken from a STALE entry when one exists
 *   3. on 304 re-store and return the cached object; on 2xx parse, store and
 *      return the new object
 *   4. if that fails, fall back to the stale object unless
 *      MAGPIE_CACHE_FRESH_ONLY is true, otherwise report the error
 *
 * @param  string $url  location of the RSS file
 * @return mixed        parsed RSS object, or false on failure (the message is
 *                      recorded via error())
 */
function fetch_rss ($url) {
    // make sure every MAGPIE_* constant has a value before it is read
    init();

    if ( !isset($url) ) {
        error("fetch_rss called without a url");
        return false;
    }

    // ---- cache disabled: plain fetch and parse --------------------------
    if ( !MAGPIE_CACHE_ON ) {
        $resp = _fetch_remote_file( $url );
        if ( is_success( $resp->status ) ) {
            return _response_to_rss( $resp );
        }

        error("Failed to fetch $url and cache is off");
        return false;
    }

    // ---- cache enabled ---------------------------------------------------
    $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );

    if (MAGPIE_DEBUG and $cache->ERROR) {
        debug($cache->ERROR, E_USER_WARNING);
    }

    // A string default plus strict comparisons below: with the former
    // integer default, 0 == 'HIT' is true on PHP < 8, so an unusable cache
    // was treated as a cache hit.
    $cache_status    = 'MISS';  // response of check_cache
    $request_headers = array(); // HTTP headers to send with fetch
    $cached_rss      = false;   // object read back from the cache, if any
    $errormsg        = '';      // errors, if any

    // store parsed XML by desired output encoding
    // as character munging happens at parse time
    $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

    if (!$cache->ERROR) {
        // return cache HIT, MISS, or STALE
        $cache_status = $cache->check_cache( $cache_key );
    }

    if ( $cache_status === 'HIT' ) {
        // object cached and fresh: hand it straight back
        $cached_rss = $cache->get( $cache_key );
        if ( $cached_rss ) {
            // should be cache age
            $cached_rss->from_cache = 1;
            if ( MAGPIE_DEBUG > 1) {
                debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
            }
            return $cached_rss;
        }
    }
    elseif ( $cache_status === 'STALE' ) {
        // attempt a conditional GET using whichever validators we stored
        $cached_rss = $cache->get( $cache_key );
        if ( $cached_rss ) {
            if ( !empty($cached_rss->etag) ) {
                $request_headers['If-None-Match'] = $cached_rss->etag;
            }
            if ( !empty($cached_rss->last_modified) ) {
                // the request header paired with Last-Modified is
                // If-Modified-Since ("If-Last-Modified" does not exist)
                $request_headers['If-Modified-Since'] = $cached_rss->last_modified;
            }
        }
    }

    $resp = _fetch_remote_file( $url, $request_headers );

    if ( $resp ) {
        if ( $resp->status == '304' and $cached_rss ) {
            // we have the most current copy
            if ( MAGPIE_DEBUG > 1) {
                debug("Got 304 for $url");
            }
            // reset cache on 304 (at minutillo insistent prodding)
            $cache->set( $cache_key, $cached_rss );
            return $cached_rss;
        }
        elseif ( is_success( $resp->status ) ) {
            // keep the parse result separate from $cached_rss so that a
            // parse failure does not throw away the stale fallback
            $fresh_rss = _response_to_rss( $resp );
            if ( $fresh_rss ) {
                if (MAGPIE_DEBUG > 1) {
                    debug("Fetch successful");
                }
                // add object to cache
                $cache->set( $cache_key, $fresh_rss );
                return $fresh_rss;
            }
            // _response_to_rss() has already reported the parse error, so
            // $errormsg stays empty to avoid triggering a second warning
        }
        else {
            // also reached for a 304 when there is nothing cached to return
            $errormsg = "Failed to fetch $url ";
            if ( $resp->status == '-100' ) {
                $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
            }
            elseif ( $resp->error ) {
                # compensate for Snoopy's annoying habit of tacking on '\n':
                # strip a trailing literal backslash-n or real whitespace,
                # but only when present (a blind substr(..., 0, -2) also
                # truncates messages that have no such suffix)
                $http_error = preg_replace('/(?:\\\\n|\s)+$/', '', $resp->error);
                $errormsg .= "(HTTP Error: $http_error)";
            }
            else {
                $errormsg .= "(HTTP Response: " . $resp->response_code . ')';
            }
        }
    }
    else {
        $errormsg = "Unable to retrieve RSS file for unknown reasons.";
    }

    // fetch failed: return the stale object unless the caller asked for
    // fresh data only
    if ( $cached_rss and !MAGPIE_CACHE_FRESH_ONLY ) {
        if ( MAGPIE_DEBUG ) {
            debug("Returning STALE object for $url");
        }
        return $cached_rss;
    }

    // else we totally failed (error() ignores an empty message)
    error( $errormsg );

    return false;
} // end fetch_rss()
222
 
223
/*=======================================================================*\
224
    Function:   error
225
    Purpose:    set MAGPIE_ERROR, and trigger error
226
\*=======================================================================*/
227
 
228
function error ($errormsg, $lvl=E_USER_WARNING) {
    global $MAGPIE_ERROR;

    // Tack PHP's own message on when one is visible in this scope.
    // NOTE(review): $php_errormsg is looked up as a local of this function,
    // so this branch probably never fires - confirm before relying on it.
    if ( isset($php_errormsg) ) {
        $errormsg = $errormsg . " ($php_errormsg)";
    }

    // nothing to report for an empty/falsy message
    if ( !$errormsg ) {
        return;
    }

    // remember the prefixed message for magpie_error(), then raise it
    $MAGPIE_ERROR = "MagpieRSS: $errormsg";
    trigger_error( $MAGPIE_ERROR, $lvl );
}
241
 
242
function debug ($debugmsg, $lvl=E_USER_NOTICE) {
    // Raise the message through PHP's error machinery with a fixed prefix;
    // unlike error(), this does not touch $MAGPIE_ERROR.
    $prefixed = 'MagpieRSS [debug] ' . $debugmsg;
    trigger_error($prefixed, $lvl);
}
245
 
246
/*=======================================================================*\
247
    Function:   magpie_error
248
    Purpose:    accessor for the magpie error variable
249
\*=======================================================================*/
250
function magpie_error ($errormsg="") {
    global $MAGPIE_ERROR;

    // A non-empty argument overwrites the stored message; otherwise this is
    // a plain getter.
    if ( !empty($errormsg) ) {
        $MAGPIE_ERROR = $errormsg;
    }

    return $MAGPIE_ERROR;
}
259
 
260
/*=======================================================================*\
261
    Function:   _fetch_remote_file
262
    Purpose:    retrieve an arbitrary remote file
263
    Input:      url of the remote file
264
                headers to send along with the request (optional)
265
    Output:     an HTTP response object (see Snoopy.class.inc)
266
\*=======================================================================*/
267
function _fetch_remote_file ($url, $headers = "" ) {
    // Snoopy is the HTTP client; after fetch() the client object itself is
    // what callers read the response from (status, error, headers, results
    // are the members used elsewhere in this file).
    $snoopy = new Snoopy();

    // extra request headers are only forwarded when given as an array
    if ( is_array($headers) ) {
        $snoopy->rawheaders = $headers;
    }

    $snoopy->use_gzip     = MAGPIE_USE_GZIP;
    $snoopy->read_timeout = MAGPIE_FETCH_TIME_OUT;
    $snoopy->agent        = MAGPIE_USER_AGENT;

    // PHP-level warnings from the fetch are deliberately silenced
    @$snoopy->fetch($url);

    return $snoopy;
}
281
 
282
/*=======================================================================*\
283
    Function:   _response_to_rss
284
    Purpose:    parse an HTTP response object into an RSS object
285
    Input:      an HTTP response object (see Snoopy)
286
    Output:     parsed RSS object (see rss_parse)
287
\*=======================================================================*/
288
function _response_to_rss ($resp) {
    // Parse the response body; the encoding constants are set up by init().
    $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

    // parse failed: report it and bail out
    if ( !$rss or $rss->ERROR ) {
        $errormsg = "Failed to parse RSS file.";

        if ($rss) {
            $errormsg .= " (" . $rss->ERROR . ")";
        }
        error($errormsg);

        return false;
    }

    // Copy the cache validators (ETag, Last-Modified) from the response
    // headers onto the RSS object so a later request can be conditional.
    $header_lines = ( isset($resp->headers) and is_array($resp->headers) )
        ? $resp->headers
        : array();

    foreach ($header_lines as $h) {
        // lines without a colon (e.g. the status line) carry no field
        $parts = explode(':', $h, 2);
        if ( count($parts) < 2 ) {
            continue;
        }

        // HTTP field names are case-insensitive, and surrounding whitespace
        // (including any line ending left on the raw header line) is not
        // part of the value.
        $field = strtolower( trim($parts[0]) );
        $val   = trim($parts[1]);

        if ( $field === 'etag' ) {
            $rss->etag = $val;
        }
        elseif ( $field === 'last-modified' ) {
            $rss->last_modified = $val;
        }
    }

    return $rss;
}
327
 
328
/*=======================================================================*\
329
    Function:   init
330
    Purpose:    setup constants with default values
331
                check for user overrides
332
\*=======================================================================*/
333
function init () {
    // Run once per request: MAGPIE_INITALIZED (sic) is the marker.
    if ( defined('MAGPIE_INITALIZED') ) {
        return;
    }
    define('MAGPIE_INITALIZED', true);

    // Defaults for every option the including script has not set itself.
    $defaults = array(
        'MAGPIE_CACHE_ON'         => false,
        'MAGPIE_CACHE_DIR'        => './cache',
        'MAGPIE_CACHE_AGE'        => 60*60,        // one hour
        'MAGPIE_CACHE_FRESH_ONLY' => false,
        'MAGPIE_OUTPUT_ENCODING'  => 'ISO-8859-1',
        'MAGPIE_INPUT_ENCODING'   => null,
        'MAGPIE_DETECT_ENCODING'  => true,
        'MAGPIE_DEBUG'            => 0,
        'MAGPIE_FETCH_TIME_OUT'   => 5,            // 5 second timeout
        'MAGPIE_USE_GZIP'         => true,         // use gzip encoding to fetch rss files if supported?
    );

    foreach ($defaults as $constant => $value) {
        if ( !defined($constant) ) {
            define($constant, $value);
        }
    }

    // The default User-Agent advertises the version and whether the cache
    // is in use, so it is built after MAGPIE_CACHE_ON has a value.
    if ( !defined('MAGPIE_USER_AGENT') ) {
        $tail = MAGPIE_CACHE_ON ? ')' : '; No cache)';
        define('MAGPIE_USER_AGENT', 'MagpieRSS/' . MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $tail);
    }
}
395
 
396
// NOTE: the following code should really be in Snoopy, or at least
397
// somewhere other than rss_fetch!
398
 
399
/*=======================================================================*\
400
    HTTP STATUS CODE PREDICATES
401
    These functions attempt to classify an HTTP status code
402
    based on RFC 2616 and RFC 2518.
403
 
404
    All of them take an HTTP status code as input, and return true or false
405
 
406
    All this code is adapted from LWP's HTTP::Status.
407
\*=======================================================================*/
408
 
409
 
410
/*=======================================================================*\
411
    Function:   is_info
412
    Purpose:    return true if Informational status code
413
\*=======================================================================*/
414
function is_info ($sc) {
    // 1xx: lower bound inclusive, upper bound exclusive
    return (100 <= $sc) && (200 > $sc);
}
417
 
418
/*=======================================================================*\
419
    Function:   is_success
420
    Purpose:    return true if Successful status code
421
\*=======================================================================*/
422
function is_success ($sc) {
    // 2xx: lower bound inclusive, upper bound exclusive
    return (200 <= $sc) && (300 > $sc);
}
425
 
426
/*=======================================================================*\
427
    Function:   is_redirect
428
    Purpose:    return true if Redirection status code
429
\*=======================================================================*/
430
function is_redirect ($sc) {
    // 3xx: lower bound inclusive, upper bound exclusive
    return (300 <= $sc) && (400 > $sc);
}
433
 
434
/*=======================================================================*\
435
    Function:   is_error
436
    Purpose:    return true if Error status code
437
\*=======================================================================*/
438
function is_error ($sc) {
    // 4xx and 5xx together: lower bound inclusive, upper bound exclusive
    return (400 <= $sc) && (600 > $sc);
}
441
 
442
/*=======================================================================*\
443
    Function:   is_client_error
444
    Purpose:    return true if Error status code, and its a client error
445
\*=======================================================================*/
446
function is_client_error ($sc) {
    // 4xx: lower bound inclusive, upper bound exclusive
    return (400 <= $sc) && (500 > $sc);
}
449
 
450
/*=======================================================================*\
451
    Function:   is_server_error
452
    Purpose:    return true if Error status code, and its a server error
453
\*=======================================================================*/
454
function is_server_error ($sc) {
    // 5xx: lower bound inclusive, upper bound exclusive
    return (500 <= $sc) && (600 > $sc);
}
457
 
458
?>