Review Board

beta

Add the djblets.feedview app for viewing RSS/Atom feeds

Updated 7 months ago

Christian Hammond Reviewers
trunk reviewboard
None Navi
This introduces the new djblets.feedview app, which is designed to allow sites to embed RSS/Atom feeds. We make use of the single-file MIT-licensed "feedparser" module for this, which we ship along with feedview.

This is to be used in the new admin UI for displaying Review Board news updates on the new dashboard.
Tested along with the new admin UI. Viewing feeds works fine.
/trunk/djblets/djblets/feedview/feedparser.py
New File
1
#!/usr/bin/env python
2
"""Universal feed parser
3
4
Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5
6
Visit http://feedparser.org/ for the latest version
7
Visit http://feedparser.org/docs/ for the latest documentation
8
9
Required: Python 2.1 or later
10
Recommended: Python 2.3 or later
11
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
12
"""
13
14
__version__ = "4.2-pre-" + "$Revision$"[11:14] + "-svn"
15
__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
16
17
Redistribution and use in source and binary forms, with or without modification,
18
are permitted provided that the following conditions are met:
19
20
* Redistributions of source code must retain the above copyright notice,
21
  this list of conditions and the following disclaimer.
22
* Redistributions in binary form must reproduce the above copyright notice,
23
  this list of conditions and the following disclaimer in the documentation
24
  and/or other materials provided with the distribution.
25
26
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
27
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36
POSSIBILITY OF SUCH DAMAGE."""
37
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
38
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
39
                    "John Beimler <http://john.beimler.org/>",
40
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
41
                    "Aaron Swartz <http://aaronsw.com/>",
42
                    "Kevin Marks <http://epeus.blogspot.com/>",
43
                    "Sam Ruby <http://intertwingly.net/>"]
44
_debug = 0
45
46
# HTTP "User-Agent" header to send to servers when downloading feeds.
47
# If you are embedding feedparser in a larger application, you should
48
# change this to your application name and URL.
49
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
50
51
# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
52
# want to send an Accept header, set this to None.
53
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
54
55
# List of preferred XML parsers, by SAX driver name.  These will be tried first,
56
# but if they're not installed, Python will keep searching through its own list
57
# of pre-installed parsers until it finds one that supports everything we need.
58
PREFERRED_XML_PARSERS = ["drv_libxml2"]
59
60
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
61
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
62
# or utidylib <http://utidylib.berlios.de/>.
63
TIDY_MARKUP = 0
64
65
# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
66
# if TIDY_MARKUP = 1
67
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
68
69
# If you want feedparser to automatically resolve all relative URIs, set this
70
# to 1.
71
RESOLVE_RELATIVE_URIS = 1
72
73
# If you want feedparser to automatically sanitize all potentially unsafe
74
# HTML content, set this to 1.
75
SANITIZE_HTML = 1
76
77
# ---------- required modules (should come with any Python distribution) ----------
78
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
79
try:
80
    from cStringIO import StringIO as _StringIO
81
except:
82
    from StringIO import StringIO as _StringIO
83
84
# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
85
86
# gzip is included with most Python distributions, but may not be available if you compiled your own
87
try:
88
    import gzip
89
except:
90
    gzip = None
91
try:
92
    import zlib
93
except:
94
    zlib = None
95
96
# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
97
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
98
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
99
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
100
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    # Deliberately broad: whatever goes wrong while probing for a SAX parser
    # is treated as "no XML parser available", and a small replacement for
    # xml.sax.saxutils.escape is defined instead.
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        """Escape '&', '>' and '<' in data, then apply extra replacements.

        data     -- the string to escape.
        entities -- optional mapping of {text: replacement}; every occurrence
                    of each key in data is replaced by its value after the
                    three built-in replacements have been made.  The default
                    mapping is only read, never modified.

        Returns the escaped string.
        """
        # '&' must be handled first so the ampersands introduced by the
        # following replacements are not escaped a second time.
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        # Iterate over (text, replacement) pairs; looping over the mapping
        # itself would only yield its keys.
        for char, entity in entities.items():
            data = data.replace(char, entity)
        return data
114
115
# base64 support for Atom feeds that contain embedded binary data
116
try:
117
    import base64, binascii
118
except:
119
    base64 = binascii = None
120
121
# cjkcodecs and iconv_codec provide support for more character encodings.
122
# Both are available from http://cjkpython.i18n.org/
123
try:
124
    import cjkcodecs.aliases
125
except:
126
    pass
127
try:
128
    import iconv_codec
129
except:
130
    pass
131
132
# chardet library auto-detects character encodings
133
# Download from http://chardet.feedparser.org/
134
try:
135
    import chardet
136
    if _debug:
137
        import chardet.constants
138
        chardet.constants._debug = 1
139
except:
140
    chardet = None
141
142
# reversible htmlentitydefs mappings; built by hand below when htmlentitydefs
# does not provide name2codepoint/codepoint2name (e.g. Python 2.2)
143
try:
144
  from htmlentitydefs import name2codepoint, codepoint2name
145
except:
146
  import htmlentitydefs
147
  name2codepoint={}
148
  codepoint2name={}
149
  for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
150
    if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
151
    name2codepoint[name]=ord(codepoint)
152
    codepoint2name[ord(codepoint)]=name
153
154
# BeautifulSoup parser used for parsing microformats from embedded HTML content
155
# http://www.crummy.com/software/BeautifulSoup/
156
# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
157
# older 2.x series.  If it doesn't, and you can figure out why, I'll accept a
158
# patch and modify the compatibility statement accordingly.
159
try:
160
    import BeautifulSoup
161
except:
162
    BeautifulSoup = None
163
164
# ---------- don't touch these ----------
165
class ThingsNobodyCaresAboutButMe(Exception):
    """Common base class for the three exception types that derive from it."""


class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """Exception type; going by its name it concerns an overridden character
    encoding (it is raised/recorded outside this section)."""


class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """Exception type; going by its name it concerns an undeterminable
    character encoding (it is raised/recorded outside this section)."""


class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """Exception type; going by its name it concerns a non-XML content type
    (it is raised/recorded outside this section)."""


class UndeclaredNamespace(Exception):
    """Exception type deriving directly from Exception rather than from
    ThingsNobodyCaresAboutButMe."""
170
171
# Replace three of sgmllib's module-level regular expressions.  The order of
# these statements matters because they modify the imported sgmllib module.
# (Lines holding only a number are line-number residue from the review page
# this text was captured from; they are not part of the program.)
#
# Tag names: a letter followed by letters, digits, '-', '_', '.' or ':'.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
172
# "Special" markup is recognised by the two-character prefix '<!'.
sgmllib.special = re.compile('<!')
173
# Numeric character references: decimal (&#38;) or hexadecimal introduced by
# a lowercase 'x' (&#x26;).
sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
174
175
# Probe the endbracket pattern sgmllib currently has: when searching ' <' it
# may report the match at a non-zero offset.  Only in that case is the
# replacement below installed.
# NOTE(review): presumably this distinguishes sgmllib versions - confirm.
if sgmllib.endbracket.search(' <').start(0):
176
    # Stand-in for a compiled pattern: it offers search() and start(), the
    # same two calls the probe above makes on sgmllib.endbracket.
    class EndBracketMatch:
177
        # Pattern applied by search(); it is anchored with match(), and both
        # alternatives end in a lookahead for '<' or '>'.
        endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
178
        # Match the pattern at position `index` of `string`.  The match is
        # remembered on the instance; returns self on success and (implicitly)
        # None when nothing matches.
        def search(self,string,index=0):
179
            self.match = self.endbracket.match(string,index)
180
            if self.match: return self
181
        # Return the END offset of group n of the remembered match - i.e. the
        # position just before the '<' or '>' the pattern looked ahead to -
        # not the offset where the match began.
        def start(self,n):
182
            return self.match.end(n)
183
    sgmllib.endbracket = EndBracketMatch()
184
185
SUPPORTED_VERSIONS = {'': 'unknown',
186
                      'rss090': 'RSS 0.90',
187
                      'rss091n': 'RSS 0.91 (Netscape)',
188
                      'rss091u': 'RSS 0.91 (Userland)',
189
                      'rss092': 'RSS 0.92',
190
                      'rss093': 'RSS 0.93',
191
                      'rss094': 'RSS 0.94',
192
                      'rss20': 'RSS 2.0',
193
                      'rss10': 'RSS 1.0',
194
                      'rss': 'RSS (unknown version)',
195
                      'atom01': 'Atom 0.1',
196
                      'atom02': 'Atom 0.2',
197
                      'atom03': 'Atom 0.3',
198
                      'atom10': 'Atom 1.0',
199
                      'atom': 'Atom (unknown version)',
200
                      'cdf': 'CDF',
201
                      'hotrss': 'Hot RSS'
202
                      }
203
204
# Pick the base mapping type that FeedParserDict derives from.
try:
    # Where the dict builtin exists it is used directly.
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict: use the UserDict class instead and
    # provide a small dict() that builds a mapping from (key, value) pairs.
    from UserDict import UserDict
    def dict(aList):
        """Return a plain dictionary built from a sequence of pairs."""
        mapping = {}
        for pair in aList:
            key, value = pair
            mapping[key] = value
        return mapping
214
215
class FeedParserDict(UserDict):
    """Mapping of parsed feed data with key aliases and attribute access.

    Items can be read and written either as ``d['name']`` or as ``d.name``.
    The names listed in ``keymap`` are aliases for other keys: writing
    through an alias stores the value under the mapped key (the first one
    when the mapping is a list), and reading through an alias returns the
    value stored under the mapped key.  The keys 'category', 'enclosures',
    'license' and 'categories' are computed on read from the stored 'tags'
    and 'links' values.
    """

    # alias -> stored key.  A list means "the first of these keys that is
    # present" when reading and "the first entry" when writing.
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        """Return the value for key, resolving computed keys and aliases.

        Raises KeyError when nothing is stored for key or for the key(s) it
        is an alias of.  The computed keys index into the stored 'tags' and
        'links' values, so errors raised by those lookups (for example
        IndexError for an empty tag list) propagate unchanged.
        """
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'enclosures':
            # Every link whose rel is 'enclosure', copied without its 'rel'.
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
        if key == 'license':
            for link in UserDict.__getitem__(self, 'links'):
                if link['rel']=='license' and link.has_key('href'):
                    return link['href']
            # No license link found: continue with the ordinary lookup.
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
            # None of the aliased keys is stored.  Look the alias itself up so
            # that a miss raises KeyError(key); using the list as a key would
            # raise TypeError because lists are unhashable.
            return UserDict.__getitem__(self, key)
        if UserDict.has_key(self, key):
            # A value stored under the requested name itself wins.
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        """Store value under key, or under the key that key is an alias of."""
        key = self.keymap.get(key, key)
        if type(key) == types.ListType:
            # Aliases with several candidates are stored under the first one.
            key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        """Return self[key] when has_key(key) is true, otherwise default."""
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value=None):
        """Store value under key unless has_key(key) is already true.

        Returns self[key] afterwards.  value defaults to None.
        """
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        """Return a true value when key is readable as attribute or item.

        NOTE(review): hasattr() is also true for ordinary attributes of the
        instance, including its methods - confirm that callers rely on this.
        """
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        """Resolve a missing attribute through item lookup.

        Names starting with an underscore are never looked up as items.
        Raises AttributeError when the name cannot be resolved.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        # Explicit check rather than an assert, so the behaviour does not
        # change when assertions are disabled (python -O).
        if key.startswith('_'):
            raise AttributeError("object has no attribute '%s'" % key)
        try:
            return self.__getitem__(key)
        except Exception:
            # Any failed item lookup is reported as a missing attribute so
            # that hasattr()/getattr() behave as usual.
            raise AttributeError("object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        """Store private names and 'data' as real attributes, the rest as items."""
        # 'data' is kept out of the item namespace; it is the attribute the
        # UserDict class (the fallback base type) keeps its storage in.
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        """Support ``key in d`` with the same rules as has_key()."""
        return self.has_key(key)
296
297
def zopeCompatibilityHack():
    """Replace the module-level FeedParserDict with a plain-dict factory.

    After this call FeedParserDict(aDict) returns an ordinary dictionary
    holding a copy of the entries of aDict (empty when aDict is omitted or
    empty).  The existing module-level FeedParserDict binding is deleted
    first, so it must exist when this function is called.
    """
    global FeedParserDict
    del FeedParserDict
    # Because of the global declaration above, this def rebinds the
    # module-level name.
    def FeedParserDict(aDict=None):
        plain = {}
        if not aDict:
            return plain
        plain.update(aDict)
        return plain
305
306
# 256-character translation table; built on the first call to
# _ebcdic_to_ascii() and reused afterwards.
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Return s with every character replaced through the EBCDIC table.

    The character with code i is replaced by the character whose code is the
    i-th entry of the table below.  The translation table is created once and
    cached in the module-level _ebcdic_to_ascii_map.
    """
    global _ebcdic_to_ascii_map
    if _ebcdic_to_ascii_map:
        # Table already built by an earlier call.
        return s.translate(_ebcdic_to_ascii_map)
    # Entry i is the replacement code for input code i (16 entries per row).
    replacement_codes = (
        0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
        16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
        128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
        144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
        32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
        38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
        45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
        186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
        195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
        202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
        209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
        216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
        123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
        125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
        92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
        48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
        )
    import string
    every_char = ''.join([chr(code) for code in range(256)])
    replacements = ''.join([chr(code) for code in replacement_codes])
    _ebcdic_to_ascii_map = string.maketrans(every_char, replacements)
    return s.translate(_ebcdic_to_ascii_map)
332
 
333
_cp1252 = {
334
  unichr(128): unichr(8364), # euro sign
335
  unichr(130): unichr(8218), # single low-9 quotation mark
336
  unichr(131): unichr( 402), # latin small letter f with hook
337
  unichr(132): unichr(8222), # double low-9 quotation mark
338
  unichr(133): unichr(8230), # horizontal ellipsis
339
  unichr(134): unichr(8224), # dagger
340
  unichr(135): unichr(8225), # double dagger
341
  unichr(136): unichr( 710), # modifier letter circumflex accent
342
  unichr(137): unichr(8240), # per mille sign
343
  unichr(138): unichr( 352), # latin capital letter s with caron
344
  unichr(139): unichr(8249), # single left-pointing angle quotation mark
345
  unichr(140): unichr( 338), # latin capital ligature oe
346
  unichr(142): unichr( 381), # latin capital letter z with caron
347
  unichr(145): unichr(8216), # left single quotation mark
348
  unichr(146): unichr(8217), # right single quotation mark
349
  unichr(147): unichr(8220), # left double quotation mark
350
  unichr(148): unichr(8221), # right double quotation mark
351
  unichr(149): unichr(8226), # bullet
352
  unichr(150): unichr(8211), # en dash
353
  unichr(151): unichr(8212), # em dash
354
  unichr(152): unichr( 732), # small tilde
355
  unichr(153): unichr(8482), # trade mark sign
356
  unichr(154): unichr( 353), # latin small letter s with caron
357
  unichr(155): unichr(8250), # single right-pointing angle quotation mark
358
  unichr(156): unichr( 339), # latin small ligature oe
359
  unichr(158): unichr( 382), # latin small letter z with caron
360
  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
361
362
# Matches "scheme://" followed by any number of extra slashes.  Scheme
# characters after the first letter are letters, digits, '+', '.' and '-';
# the '-' is placed last in the class so it is a literal hyphen and not a
# range (written as '+-.' the class would also admit ',').
_urifixer = re.compile(r'^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)')
def _urljoin(base, uri):
    """Join uri onto base after removing surplus slashes behind the scheme.

    base -- the URI that uri is resolved against.
    uri  -- the (possibly relative) URI to resolve; a prefix such as
            'http:////host' is first reduced to 'http://host'.

    Returns the result of urlparse.urljoin.  If that call fails, every
    component of uri is percent-quoted and the join is attempted once more;
    an error from the second attempt propagates to the caller.
    """
    uri = _urifixer.sub(r'\1\3', uri)
    try:
        return urlparse.urljoin(base, uri)
    except Exception:
        # Best effort: quote each parsed component and retry once.
        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
        return urlparse.urljoin(base, uri)
370
371
class _FeedParserMixin:
372
    namespaces = {'': '',
373
                  'http://backend.userland.com/rss': '',
374
                  'http://blogs.law.harvard.edu/tech/rss': '',
375
                  'http://purl.org/rss/1.0/': '',
376
                  'http://my.netscape.com/rdf/simple/0.9/': '',
377
                  'http://example.com/newformat#': '',
378
                  'http://example.com/necho': '',
379
                  'http://purl.org/echo/': '',
380
                  'uri/of/echo/namespace#': '',
381
                  'http://purl.org/pie/': '',
382
                  'http://purl.org/atom/ns#': '',
383
                  'http://www.w3.org/2005/Atom': '',
384
                  'http://purl.org/rss/1.0/modules/rss091#': '',
385
                  
386
                  'http://webns.net/mvcb/':                               'admin',
387
                  'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
388
                  'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
389
                  'http://media.tangent.org/rss/1.0/':                    'audio',
390
                  'http://backend.userland.com/blogChannelModule':        'blogChannel',
391
                  'http://web.resource.org/cc/':                          'cc',
392
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
393
                  'http://purl.org/rss/1.0/modules/company':              'co',
394
                  'http://purl.org/rss/1.0/modules/content/':             'content',
395
                  'http://my.theinfo.org/changed/1.0/rss/':               'cp',
396
                  'http://purl.org/dc/elements/1.1/':                     'dc',
397
                  'http://purl.org/dc/terms/':                            'dcterms',
398
                  'http://purl.org/rss/1.0/modules/email/':               'email',
399
                  'http://purl.org/rss/1.0/modules/event/':               'ev',
400
                  'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
401
                  'http://freshmeat.net/rss/fm/':                         'fm',
402
                  'http://xmlns.com/foaf/0.1/':                           'foaf',
403
                  'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
404
                  'http://postneo.com/icbm/':                             'icbm',
405
                  'http://purl.org/rss/1.0/modules/image/':               'image',
406
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
407
                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
408
                  'http://purl.org/rss/1.0/modules/link/':                'l',
409
                  'http://search.yahoo.com/mrss':                         'media',
410
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
411
                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
412
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
413
                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
414
                  'http://purl.org/rss/1.0/modules/reference/':           'ref',
415
                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
416
                  'http://purl.org/rss/1.0/modules/search/':              'search',
417
                  'http://purl.org/rss/1.0/modules/slash/':               'slash',
418
                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
419
                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
420
                  'http://hacks.benhammersley.com/rss/streaming/':        'str',
421
                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',
422
                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',
423
                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf',
424
                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
425
                  'http://purl.org/rss/1.0/modules/threading/':           'thr',
426
                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',
427
                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
428
                  'http://wellformedweb.org/commentAPI/':                 'wfw',
429
                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
430
                  'http://www.w3.org/1999/xhtml':                         'xhtml',
431
                  'http://www.w3.org/1999/xlink':                         'xlink',
432
                  'http://www.w3.org/XML/1998/namespace':                 'xml'
433
}
434
    _matchnamespaces = {}
435
436
    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon'<