From 02855c7b9c2e20be74ad97fb9d3692380377cf3d Mon Sep 17 00:00:00 2001
From: Ruud
Date: Mon, 11 Jun 2012 09:54:15 +0200
Subject: [PATCH] Library update

---
 couchpotato/core/providers/nzb/mysterbin/main.py | 10 +-
 couchpotato/core/providers/nzb/nzbclub/main.py | 2 +-
 couchpotato/core/providers/nzb/nzbindex/main.py | 2 +-
 .../core/providers/torrent/kickasstorrents/main.py | 12 +-
 .../core/providers/trailer/hdtrailers/main.py | 14 +-
 .../core/providers/userscript/allocine/main.py | 2 +-
 .../providers/userscript/rottentomatoes/main.py | 2 +-
 libs/BeautifulSoup.py | 2015 ------------
 libs/bs4/__init__.py | 355 +++
 libs/bs4/builder/__init__.py | 307 ++
 libs/bs4/builder/_html5lib.py | 222 ++
 libs/bs4/builder/_htmlparser.py | 244 ++
 libs/bs4/builder/_lxml.py | 179 ++
 libs/bs4/dammit.py | 792 +++++
 libs/bs4/element.py | 1347 ++++++++
 libs/bs4/testing.py | 515 +++
 libs/certifi/__init__.py | 1 +
 libs/certifi/cacert.pem | 3338 ++++++++++++++++++++
 libs/certifi/core.py | 19 +
 libs/guessit/ISO-3166-1_utf8.txt | 0
 libs/guessit/ISO-639-2_utf-8.txt | 0
 libs/guessit/__init__.py | 2 +-
 libs/guessit/country.py | 4 +-
 libs/guessit/date.py | 0
 libs/guessit/fileutils.py | 7 +-
 libs/guessit/guess.py | 0
 libs/guessit/hash_ed2k.py | 0
 libs/guessit/hash_mpc.py | 0
 libs/guessit/language.py | 81 +-
 libs/guessit/matcher.py | 0
 libs/guessit/matchtree.py | 0
 libs/guessit/patterns.py | 6 +-
 libs/guessit/slogging.py | 0
 libs/guessit/textutils.py | 22 +
 libs/guessit/transfo/__init__.py | 0
 libs/guessit/transfo/guess_bonus_features.py | 0
 libs/guessit/transfo/guess_date.py | 0
 .../transfo/guess_episode_info_from_position.py | 0
 libs/guessit/transfo/guess_episodes_rexps.py | 23 +-
 libs/guessit/transfo/guess_filetype.py | 3 +
 libs/guessit/transfo/guess_language.py | 0
 .../transfo/guess_movie_title_from_position.py | 0
 libs/guessit/transfo/guess_properties.py | 0
 libs/guessit/transfo/guess_release_group.py | 0
 libs/guessit/transfo/guess_video_rexps.py | 0
 libs/guessit/transfo/guess_weak_episodes_rexps.py | 0
 libs/guessit/transfo/guess_website.py | 0
 libs/guessit/transfo/guess_year.py | 0
 libs/guessit/transfo/post_process.py | 0
 libs/guessit/transfo/split_explicit_groups.py | 0
 libs/guessit/transfo/split_on_dash.py | 0
 libs/guessit/transfo/split_path_components.py | 0
 libs/html5lib/__init__.py | 17 +
 libs/html5lib/constants.py | 3085 ++++++++++++++++++
 libs/html5lib/filters/__init__.py | 0
 libs/html5lib/filters/_base.py | 10 +
 libs/html5lib/filters/formfiller.py | 127 +
 libs/html5lib/filters/inject_meta_charset.py | 62 +
 libs/html5lib/filters/lint.py | 88 +
 libs/html5lib/filters/optionaltags.py | 202 ++
 libs/html5lib/filters/sanitizer.py | 8 +
 libs/html5lib/filters/whitespace.py | 41 +
 libs/html5lib/html5parser.py | 2733 ++++++++++++++++
 libs/html5lib/ihatexml.py | 177 ++
 libs/html5lib/inputstream.py | 782 +++++
 libs/html5lib/sanitizer.py | 258 ++
 libs/html5lib/serializer/__init__.py | 17 +
 libs/html5lib/serializer/htmlserializer.py | 312 ++
 libs/html5lib/serializer/xhtmlserializer.py | 9 +
 libs/html5lib/tokenizer.py | 1744 ++++++++++
 libs/html5lib/treebuilders/__init__.py | 96 +
 libs/html5lib/treebuilders/_base.py | 377 +++
 libs/html5lib/treebuilders/dom.py | 291 ++
 libs/html5lib/treebuilders/etree.py | 344 ++
 libs/html5lib/treebuilders/etree_lxml.py | 336 ++
 libs/html5lib/treebuilders/simpletree.py | 256 ++
 libs/html5lib/treebuilders/soup.py | 236 ++
 libs/html5lib/treewalkers/__init__.py | 52 +
 libs/html5lib/treewalkers/_base.py | 176 ++
 libs/html5lib/treewalkers/dom.py | 41 +
 libs/html5lib/treewalkers/etree.py | 141 +
 libs/html5lib/treewalkers/genshistream.py | 70 +
 libs/html5lib/treewalkers/lxmletree.py | 186 ++
 libs/html5lib/treewalkers/pulldom.py | 60 +
 libs/html5lib/treewalkers/simpletree.py | 78 +
 libs/html5lib/treewalkers/soup.py | 60 +
 libs/html5lib/utils.py | 175 +
 libs/oauthlib/__init__.py | 0
 libs/oauthlib/common.py | 155 +
 libs/oauthlib/oauth1/__init__.py | 13 +
 libs/oauthlib/oauth1/rfc5849/__init__.py | 350 ++
 libs/oauthlib/oauth1/rfc5849/parameters.py | 134 +
 libs/oauthlib/oauth1/rfc5849/signature.py | 501 +++
 libs/oauthlib/oauth1/rfc5849/utils.py | 141 +
 libs/oauthlib/oauth2/__init__.py | 13 +
 libs/oauthlib/oauth2/draft25/__init__.py | 14 +
 libs/oauthlib/oauth2/draft25/tokens.py | 131 +
 libs/oauthlib/oauth2/draft25/utils.py | 128 +
 libs/subliminal/api.py | 17 +-
 libs/subliminal/async.py | 14 +-
 libs/subliminal/cache.py | 132 +
 libs/subliminal/core.py | 75 +-
 libs/subliminal/infos.py | 2 +-
 libs/subliminal/languages.py | 547 ----
 libs/subliminal/services/__init__.py | 171 +-
 libs/subliminal/services/addic7ed.py | 161 +
 libs/subliminal/services/bierdopje.py | 78 +-
 libs/subliminal/services/opensubtitles.py | 105 +-
 libs/subliminal/services/podnapisi.py | 106 +
 libs/subliminal/services/subswiki.py | 23 +-
 libs/subliminal/services/subtitulos.py | 31 +-
 libs/subliminal/services/thesubdb.py | 25 +-
 libs/subliminal/services/tvsubtitles.py | 146 +
 libs/subliminal/subtitles.py | 36 +-
 libs/subliminal/videos.py | 33 +-
 115 files changed, 22575 insertions(+), 2880 deletions(-)
 delete mode 100644 libs/BeautifulSoup.py
 create mode 100644 libs/bs4/__init__.py
 create mode 100644 libs/bs4/builder/__init__.py
 create mode 100644 libs/bs4/builder/_html5lib.py
 create mode 100644 libs/bs4/builder/_htmlparser.py
 create mode 100644 libs/bs4/builder/_lxml.py
 create mode 100644 libs/bs4/dammit.py
 create mode 100644 libs/bs4/element.py
 create mode 100644 libs/bs4/testing.py
 create mode 100644 libs/certifi/__init__.py
 create mode 100644 libs/certifi/cacert.pem
 create mode 100644 libs/certifi/core.py
 mode change 100644 => 100755 libs/guessit/ISO-3166-1_utf8.txt
 mode change 100644 => 100755 libs/guessit/ISO-639-2_utf-8.txt
 mode change 100644 => 100755 libs/guessit/__init__.py
 mode change 100644 => 100755 libs/guessit/country.py
 mode change 100644 => 100755 libs/guessit/date.py
 mode change 100644 => 100755 libs/guessit/fileutils.py
 mode change 100644 => 100755 libs/guessit/guess.py
 mode change 100644 => 100755 libs/guessit/hash_ed2k.py
 mode change 100644 => 100755 libs/guessit/hash_mpc.py
 mode change 100644 => 100755 libs/guessit/language.py
 mode change 100644 => 100755 libs/guessit/matcher.py
 mode change 100644 => 100755 libs/guessit/matchtree.py
 mode change 100644 => 100755 libs/guessit/slogging.py
 mode change 100644 => 100755 libs/guessit/textutils.py
 mode change 100644 => 100755 libs/guessit/transfo/__init__.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_bonus_features.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_date.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_episode_info_from_position.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_episodes_rexps.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_filetype.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_language.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_movie_title_from_position.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_properties.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_release_group.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_video_rexps.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_weak_episodes_rexps.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_website.py
 mode change 100644 => 100755 libs/guessit/transfo/guess_year.py
 mode change 100644 => 100755 libs/guessit/transfo/post_process.py
 mode change 100644 => 100755 libs/guessit/transfo/split_explicit_groups.py
 mode change 100644 => 100755 libs/guessit/transfo/split_on_dash.py
 mode change 100644 => 100755 libs/guessit/transfo/split_path_components.py
 create mode 100644 libs/html5lib/__init__.py
 create mode 100644 libs/html5lib/constants.py
 create mode 100644 libs/html5lib/filters/__init__.py
 create mode 100644 libs/html5lib/filters/_base.py
 create mode 100644 libs/html5lib/filters/formfiller.py
 create mode 100644 libs/html5lib/filters/inject_meta_charset.py
 create mode 100644 libs/html5lib/filters/lint.py
 create mode 100644 libs/html5lib/filters/optionaltags.py
 create mode 100644 libs/html5lib/filters/sanitizer.py
 create mode 100644 libs/html5lib/filters/whitespace.py
 create mode 100644 libs/html5lib/html5parser.py
 create mode 100644 libs/html5lib/ihatexml.py
 create mode 100644 libs/html5lib/inputstream.py
 create mode 100644 libs/html5lib/sanitizer.py
 create mode 100644 libs/html5lib/serializer/__init__.py
 create mode 100644 libs/html5lib/serializer/htmlserializer.py
 create mode 100644 libs/html5lib/serializer/xhtmlserializer.py
 create mode 100644 libs/html5lib/tokenizer.py
 create mode 100755 libs/html5lib/treebuilders/__init__.py
 create mode 100755 libs/html5lib/treebuilders/_base.py
 create mode 100644 libs/html5lib/treebuilders/dom.py
 create mode 100755 libs/html5lib/treebuilders/etree.py
 create mode 100644 libs/html5lib/treebuilders/etree_lxml.py
 create mode 100755 libs/html5lib/treebuilders/simpletree.py
 create mode 100644 libs/html5lib/treebuilders/soup.py
 create mode 100644 libs/html5lib/treewalkers/__init__.py
 create mode 100644 libs/html5lib/treewalkers/_base.py
 create mode 100644 libs/html5lib/treewalkers/dom.py
 create mode 100644 libs/html5lib/treewalkers/etree.py
 create mode 100644 libs/html5lib/treewalkers/genshistream.py
 create mode 100644 libs/html5lib/treewalkers/lxmletree.py
 create mode 100644 libs/html5lib/treewalkers/pulldom.py
 create mode 100644 libs/html5lib/treewalkers/simpletree.py
 create mode 100644 libs/html5lib/treewalkers/soup.py
 create mode 100644 libs/html5lib/utils.py
 create mode 100644 libs/oauthlib/__init__.py
 create mode 100644 libs/oauthlib/common.py
 create mode 100644 libs/oauthlib/oauth1/__init__.py
 create mode 100644 libs/oauthlib/oauth1/rfc5849/__init__.py
 create mode 100644 libs/oauthlib/oauth1/rfc5849/parameters.py
 create mode 100644 libs/oauthlib/oauth1/rfc5849/signature.py
 create mode 100644 libs/oauthlib/oauth1/rfc5849/utils.py
 create mode 100644 libs/oauthlib/oauth2/__init__.py
 create mode 100644 libs/oauthlib/oauth2/draft25/__init__.py
 create mode 100644 libs/oauthlib/oauth2/draft25/tokens.py
 create mode 100644 libs/oauthlib/oauth2/draft25/utils.py
 create mode 100755 libs/subliminal/cache.py
 delete mode 100755 libs/subliminal/languages.py
 create mode 100755 libs/subliminal/services/addic7ed.py
 create mode 100755 libs/subliminal/services/podnapisi.py
 create mode 100755 libs/subliminal/services/tvsubtitles.py
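Note on the new libs above: besides the BeautifulSoup 3 -> bs4 swap that drives the
provider diffs below, this update vendors certifi, whose only job is to ship a
Mozilla-derived CA bundle (libs/certifi/cacert.pem). A minimal usage sketch, assuming
the vendored copy exposes the same small API as upstream certifi; the urlopen wiring
is illustrative and not taken from this patch:

    import certifi

    # certifi.where() returns the filesystem path to the bundled cacert.pem,
    # which an HTTPS client can use as its CA file for certificate verification.
    ca_bundle = certifi.where()
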
diff --git a/couchpotato/core/providers/nzb/mysterbin/main.py b/couchpotato/core/providers/nzb/mysterbin/main.py
index 5e61979..008f24f 100644
--- a/couchpotato/core/providers/nzb/mysterbin/main.py
+++ b/couchpotato/core/providers/nzb/mysterbin/main.py
@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
     simplifyString
@@ -49,21 +49,21 @@ class Mysterbin(NZBProvider):
         try:
             html = BeautifulSoup(data)
             resultable = html.find('table', attrs = {'class':'t'})
-            for result in resultable.findAll('tr'):
+            for result in resultable.find_all('tr'):
                 try:
                     myster_id = result.find('input', attrs = {'class': 'check4nzb'})['value']
 
                     # Age
                     age = ''
-                    for temp in result.find('td', attrs = {'class': 'cdetail'}).findAll(text = True):
+                    for temp in result.find('td', attrs = {'class': 'cdetail'}).find_all(text = True):
                         if 'days' in temp:
                             age = tryInt(temp.split(' ')[0])
                             break
 
                     # size
                     size = None
-                    for temp in result.find('div', attrs = {'class': 'cdetail'}).findAll(text = True):
+                    for temp in result.find('div', attrs = {'class': 'cdetail'}).find_all(text = True):
                         if 'gb' in temp.lower() or 'mb' in temp.lower() or 'kb' in temp.lower():
                             size = self.parseSize(temp)
                             break
@@ -74,7 +74,7 @@ class Mysterbin(NZBProvider):
 
                     new = {
                         'id': myster_id,
-                        'name': ''.join(result.find('span', attrs = {'class': 'cname'}).findAll(text = True)),
+                        'name': ''.join(result.find('span', attrs = {'class': 'cname'}).find_all(text = True)),
                         'type': 'nzb',
                         'provider': self.getName(),
                         'age': age,
diff --git a/couchpotato/core/providers/nzb/nzbclub/main.py b/couchpotato/core/providers/nzb/nzbclub/main.py
index e6dbad9..3632c32 100644
--- a/couchpotato/core/providers/nzb/nzbclub/main.py
+++ b/couchpotato/core/providers/nzb/nzbclub/main.py
@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
     simplifyString
diff --git a/couchpotato/core/providers/nzb/nzbindex/main.py b/couchpotato/core/providers/nzb/nzbindex/main.py
index fd53cdd..5f74c09 100644
--- a/couchpotato/core/providers/nzb/nzbindex/main.py
+++ b/couchpotato/core/providers/nzb/nzbindex/main.py
@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
     simplifyString
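The three NZB provider diffs above are mechanical renames: BeautifulSoup 3's
camelCase search methods become underscore names in bs4 (findAll -> find_all,
findAllNext -> find_all_next, and so on), while find() keeps its name. A minimal
sketch of the new spelling; the markup and variable names are made up for
illustration, not taken from this patch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<table class='t'><tr><td>42 days</td></tr></table>")
    table = soup.find('table', attrs = {'class': 't'})   # unchanged from BS3
    rows = table.find_all('tr')                          # BS3 spelling: table.findAll('tr')
    texts = table.find_all(text = True)                  # text nodes, as in the age/size loops above

bs4 still accepts the old findAll spelling as a deprecated alias, so these hunks
change nothing in behavior; they just move the code onto the supported names.
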
diff --git a/couchpotato/core/providers/torrent/kickasstorrents/main.py b/couchpotato/core/providers/torrent/kickasstorrents/main.py
index 1101d32..2d82a4d 100644
--- a/couchpotato/core/providers/torrent/kickasstorrents/main.py
+++ b/couchpotato/core/providers/torrent/kickasstorrents/main.py
@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.helpers.variable import tryInt, getTitle
 from couchpotato.core.logger import CPLog
@@ -47,14 +47,14 @@ class KickAssTorrents(TorrentProvider):
         try:
             html = BeautifulSoup(data)
             resultdiv = html.find('div', attrs = {'class':'tabs'})
-            for result in resultdiv.findAll('div', recursive = False):
+            for result in resultdiv.find_all('div', recursive = False):
                 if result.get('id').lower() not in cat_ids:
                     continue
 
                 try:
                     try:
-                        for temp in result.findAll('tr'):
+                        for temp in result.find_all('tr'):
                             if temp['class'] is 'firstr' or not temp.get('id'):
                                 continue
 
@@ -68,15 +68,15 @@ class KickAssTorrents(TorrentProvider):
                             }
 
                             nr = 0
-                            for td in temp.findAll('td'):
+                            for td in temp.find_all('td'):
                                 column_name = table_order[nr]
                                 if column_name:
 
                                     if column_name is 'name':
-                                        link = td.find('div', {'class': 'torrentname'}).findAll('a')[1]
+                                        link = td.find('div', {'class': 'torrentname'}).find_all('a')[1]
                                         new['id'] = temp.get('id')[-8:]
                                         new['name'] = link.text
-                                        new['url'] = td.findAll('a', 'idownload')[1]['href']
+                                        new['url'] = td.find_all('a', 'idownload')[1]['href']
                                         if new['url'][:2] == '//':
                                             new['url'] = 'http:%s' % new['url']
                                         new['score'] = 20 if td.find('a', 'iverif') else 0
diff --git a/couchpotato/core/providers/trailer/hdtrailers/main.py b/couchpotato/core/providers/trailer/hdtrailers/main.py
index b68f76f..642079f 100644
--- a/couchpotato/core/providers/trailer/hdtrailers/main.py
+++ b/couchpotato/core/providers/trailer/hdtrailers/main.py
@@ -1,4 +1,4 @@
-from BeautifulSoup import SoupStrainer, BeautifulSoup
+from bs4 import SoupStrainer, BeautifulSoup
 from couchpotato.core.helpers.encoding import tryUrlencode
 from couchpotato.core.helpers.variable import mergeDicts, getTitle
 from couchpotato.core.logger import CPLog
@@ -51,13 +51,13 @@ class HDTrailers(TrailerProvider):
 
         try:
             tables = SoupStrainer('div')
-            html = BeautifulSoup(data, parseOnlyThese = tables)
-            result_table = html.findAll('h2', text = re.compile(movie_name))
+            html = BeautifulSoup(data, parse_only = tables)
+            result_table = html.find_all('h2', text = re.compile(movie_name))
 
             for h2 in result_table:
                 if 'trailer' in h2.lower():
                     parent = h2.parent.parent.parent
-                    trailerLinks = parent.findAll('a', text = re.compile('480p|720p|1080p'))
+                    trailerLinks = parent.find_all('a', text = re.compile('480p|720p|1080p'))
                     try:
                         for trailer in trailerLinks:
                             results[trailer].insert(0, trailer.parent['href'])
@@ -74,11 +74,11 @@ class HDTrailers(TrailerProvider):
         results = {'480p':[], '720p':[], '1080p':[]}
         try:
             tables = SoupStrainer('table')
-            html = BeautifulSoup(data, parseOnlyThese = tables)
+            html = BeautifulSoup(data, parse_only = tables)
 
             result_table = html.find('table', attrs = {'class':'bottomTable'})
-            for tr in result_table.findAll('tr'):
+            for tr in result_table.find_all('tr'):
                 trtext = str(tr).lower()
                 if 'clips' in trtext:
                     break
@@ -86,7 +86,7 @@ class HDTrailers(TrailerProvider):
                 nr = 0
                 if 'trailer' not in tr.find('span', 'standardTrailerName').text.lower():
                     continue
-                resolutions = tr.findAll('td', attrs = {'class':'bottomTableResolution'})
+                resolutions = tr.find_all('td', attrs = {'class':'bottomTableResolution'})
                 for res in resolutions:
                     results[str(res.a.contents[0])].insert(0, res.a['href'])
                     nr += 1
diff --git a/couchpotato/core/providers/userscript/allocine/main.py b/couchpotato/core/providers/userscript/allocine/main.py
index 8213ac2..890ae22 100644
--- a/couchpotato/core/providers/userscript/allocine/main.py
+++ b/couchpotato/core/providers/userscript/allocine/main.py
@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.providers.userscript.base import UserscriptBase
 
 class AlloCine(UserscriptBase):
diff --git a/couchpotato/core/providers/userscript/rottentomatoes/main.py b/couchpotato/core/providers/userscript/rottentomatoes/main.py
index 1d68590..cd869b8 100644
--- a/couchpotato/core/providers/userscript/rottentomatoes/main.py
+++ b/couchpotato/core/providers/userscript/rottentomatoes/main.py
@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
 from couchpotato.core.event import fireEvent
 from couchpotato.core.providers.userscript.base import UserscriptBase
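The hdtrailers hunks above also pick up the renamed partial-parsing keyword: BS3's
parseOnlyThese becomes parse_only in bs4, while SoupStrainer itself keeps its name
and import location. A small sketch of the new spelling, again with made-up markup
for illustration:

    from bs4 import BeautifulSoup, SoupStrainer

    # Only <table> elements are parsed; everything outside them is skipped
    only_tables = SoupStrainer('table')
    soup = BeautifulSoup("<div>ignored</div><table><tr><td>kept</td></tr></table>",
                         parse_only = only_tables)
    cell = soup.find('td')    # -> <td>kept</td>

One caveat worth knowing: parse_only works with the html.parser and lxml tree
builders, but the html5lib builder (also vendored in this commit) does not support
SoupStrainer and will parse the whole document.
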
diff --git a/libs/BeautifulSoup.py b/libs/BeautifulSoup.py
deleted file mode 100644
index addd35d..0000000
--- a/libs/BeautifulSoup.py
+++ /dev/null
@@ -1,2015 +0,0 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
-http://www.crummy.com/software/BeautifulSoup/
-
-Beautiful Soup parses a (possibly invalid) XML or HTML document into a
-tree representation. It provides methods and Pythonic idioms that make
-it easy to navigate, search, and modify the tree.
-
-A well-formed XML/HTML document yields a well-formed data
-structure. An ill-formed XML/HTML document yields a correspondingly
-ill-formed data structure. If your document is only locally
-well-formed, you can use this library to find and process the
-well-formed part of it.
-
-Beautiful Soup works with Python 2.2 and up. It has no external
-dependencies, but you'll have more success at converting data to UTF-8
-if you also install these three packages:
-
-* chardet, for auto-detecting character encodings
-  http://chardet.feedparser.org/
-* cjkcodecs and iconv_codec, which add more encodings to the ones supported
-  by stock Python.
-  http://cjkpython.i18n.org/
-
-Beautiful Soup defines classes for two main parsing strategies:
-
- * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
-   language that kind of looks like XML.
-
- * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
-   or invalid. This class has web browser-like heuristics for
-   obtaining a sensible parse tree in the face of common HTML errors.
-
-Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
-the encoding of an HTML or XML document, and converting it to
-Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
-
-For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/documentation.html
-
-Here, have some legalese:
-
-Copyright (c) 2004-2010, Leonard Richardson
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
-  * Neither the name of the the Beautiful Soup Consortium and All
-    Night Kosher Bakery nor the names of its contributors may be
-    used to endorse or promote products derived from this software
-    without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
- -""" -from __future__ import generators - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.2.0" -__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson" -__license__ = "New-style BSD" - -from sgmllib import SGMLParser, SGMLParseError -import codecs -import markupbase -import types -import re -import sgmllib -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} -try: - set -except NameError: - from sets import Set as set - -#These hacks make Beautiful Soup able to parse XML with namespaces -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match - -DEFAULT_OUTPUT_ENCODING = "utf-8" - -def _match_css_class(str): - """Build a RE to match the given CSS class.""" - return re.compile(r"(^|.*\s)%s($|\s)" % str) - -# First, the classes that represent markup elements. - -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - def setup(self, parent = None, previous = None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - self.previous = previous - self.next = None - self.previousSibling = None - self.nextSibling = None - if self.parent and self.parent.contents: - self.previousSibling = self.parent.contents[-1] - self.previousSibling.nextSibling = self - - def replaceWith(self, replaceWith): - oldParent = self.parent - myIndex = self.parent.index(self) - if hasattr(replaceWith, "parent")\ - and replaceWith.parent is self.parent: - # We're replacing this element with one of its siblings. - index = replaceWith.parent.index(replaceWith) - if index and index < myIndex: - # Furthermore, it comes before this element. That - # means that when we extract it, the index of this - # element will change. - myIndex = myIndex - 1 - self.extract() - oldParent.insert(myIndex, replaceWith) - - def replaceWithChildren(self): - myParent = self.parent - myIndex = self.parent.index(self) - self.extract() - reversedChildren = list(self.contents) - reversedChildren.reverse() - for child in reversedChildren: - myParent.insert(myIndex, child) - - def extract(self): - """Destructively rips this element out of the tree.""" - if self.parent: - try: - del self.parent.contents[self.parent.index(self)] - except ValueError: - pass - - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. - lastChild = self._lastRecursiveChild() - nextElement = lastChild.next - - if self.previous: - self.previous.next = nextElement - if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None - - self.parent = None - if self.previousSibling: - self.previousSibling.nextSibling = self.nextSibling - if self.nextSibling: - self.nextSibling.previousSibling = self.previousSibling - self.previousSibling = self.nextSibling = None - return self - - def _lastRecursiveChild(self): - "Finds the last element beneath this object to be parsed." 
- lastChild = self - while hasattr(lastChild, 'contents') and lastChild.contents: - lastChild = lastChild.contents[-1] - return lastChild - - def insert(self, position, newChild): - if isinstance(newChild, basestring) \ - and not isinstance(newChild, NavigableString): - newChild = NavigableString(newChild) - - position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent is not None: - # We're 'inserting' an element that's already one - # of this object's children. - if newChild.parent is self: - index = self.index(newChild) - if index > position: - # Furthermore we're moving it further down the - # list of this object's children. That means that - # when we extract this element, our target index - # will jump down one. - position = position - 1 - newChild.extract() - - newChild.parent = self - previousChild = None - if position == 0: - newChild.previousSibling = None - newChild.previous = self - else: - previousChild = self.contents[position - 1] - newChild.previousSibling = previousChild - newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._lastRecursiveChild() - if newChild.previous: - newChild.previous.next = newChild - - newChildsLastElement = newChild._lastRecursiveChild() - - if position >= len(self.contents): - newChild.nextSibling = None - - parent = self - parentsNextSibling = None - while not parentsNextSibling: - parentsNextSibling = parent.nextSibling - parent = parent.parent - if not parent: # This is the last element in the document. - break - if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling - else: - newChildsLastElement.next = None - else: - nextChild = self.contents[position] - newChild.nextSibling = nextChild - if newChild.nextSibling: - newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild - - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement - self.contents.insert(position, newChild) - - def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.insert(len(self.contents), tag) - - def findNext(self, name = None, attrs = {}, text = None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._findOne(self.findAllNext, name, attrs, text, **kwargs) - - def findAllNext(self, name = None, attrs = {}, text = None, limit = None, - **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.nextGenerator, - **kwargs) - - def findNextSibling(self, name = None, attrs = {}, text = None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._findOne(self.findNextSiblings, name, attrs, text, - **kwargs) - - def findNextSiblings(self, name = None, attrs = {}, text = None, limit = None, - **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.nextSiblingGenerator, **kwargs) - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x - - def findPrevious(self, name = None, attrs = {}, text = None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" - return self._findOne(self.findAllPrevious, name, attrs, text, 
**kwargs) - - def findAllPrevious(self, name = None, attrs = {}, text = None, limit = None, - **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.previousGenerator, - **kwargs) - fetchPrevious = findAllPrevious # Compatibility with pre-3.x - - def findPreviousSibling(self, name = None, attrs = {}, text = None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._findOne(self.findPreviousSiblings, name, attrs, text, - **kwargs) - - def findPreviousSiblings(self, name = None, attrs = {}, text = None, - limit = None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x - - def findParent(self, name = None, attrs = {}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments. - r = None - l = self.findParents(name, attrs, 1) - if l: - r = l[0] - return r - - def findParents(self, name = None, attrs = {}, limit = None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" - - return self._findAll(name, attrs, None, limit, self.parentGenerator, - **kwargs) - fetchParents = findParents # Compatibility with pre-3.x - - #These methods do the real heavy lifting. - - def _findOne(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _findAll(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if isinstance(name, SoupStrainer): - strainer = name - # (Possibly) special case some findAll*(...) searches - elif text is None and not limit and not attrs and not kwargs: - # findAll*(True) - if name is True: - return [element for element in generator() - if isinstance(element, Tag)] - # findAll*('tag-name') - elif isinstance(name, basestring): - return [element for element in generator() - if isinstance(element, Tag) and - element.name == name] - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - # Build a SoupStrainer - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - g = generator() - while True: - try: - i = g.next() - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #These Generators can be used to navigate starting from both - #NavigableStrings and Tags. 
- def nextGenerator(self): - i = self - while i is not None: - i = i.next - yield i - - def nextSiblingGenerator(self): - i = self - while i is not None: - i = i.nextSibling - yield i - - def previousGenerator(self): - i = self - while i is not None: - i = i.previous - yield i - - def previousSiblingGenerator(self): - i = self - while i is not None: - i = i.previousSibling - yield i - - def parentGenerator(self): - i = self - while i is not None: - i = i.parent - yield i - - # Utility methods - def substituteEncoding(self, str, encoding = None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) - - def toEncoding(self, s, encoding = None): - """Encodes an object to a string in some encoding, or to Unicode. - .""" - if isinstance(s, unicode): - if encoding: - s = s.encode(encoding) - elif isinstance(s, str): - if encoding: - s = s.encode(encoding) - else: - s = unicode(s) - else: - if encoding: - s = self.toEncoding(str(s), encoding) - else: - s = unicode(s) - return s - -class NavigableString(unicode, PageElement): - - def __new__(cls, value): - """Create a new NavigableString. - - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. - """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) - - def __getnewargs__(self): - return (NavigableString.__str__(self),) - - def __getattr__(self, attr): - """text.string gives you text. This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - - def __unicode__(self): - return str(self).decode(DEFAULT_OUTPUT_ENCODING) - - def __str__(self, encoding = DEFAULT_OUTPUT_ENCODING): - if encoding: - return self.encode(encoding) - else: - return self - -class CData(NavigableString): - - def __str__(self, encoding = DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class ProcessingInstruction(NavigableString): - def __str__(self, encoding = DEFAULT_OUTPUT_ENCODING): - output = self - if "%SOUP-ENCODING%" in output: - output = self.substituteEncoding(output, encoding) - return "" % self.toEncoding(output, encoding) - -class Comment(NavigableString): - def __str__(self, encoding = DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class Declaration(NavigableString): - def __str__(self, encoding = DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" - - def _invert(h): - "Cheap function to invert a hash." - i = {} - for k, v in h.items(): - i[v] = k - return i - - XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", - "quot" : '"', - "amp" : "&", - "lt" : "<", - "gt" : ">" } - - XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) - - def _convertEntities(self, match): - """Used in a call to re.sub to replace HTML, XML, and numeric - entities with the appropriate Unicode characters. 
If HTML - entities are being converted, any unrecognized entities are - escaped.""" - x = match.group(1) - if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) - elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: - if self.convertXMLEntities: - return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] - else: - return u'&%s;' % x - elif len(x) > 0 and x[0] == '#': - # Handle numeric entities - if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) - else: - return unichr(int(x[1:])) - - elif self.escapeUnrecognizedEntities: - return u'&%s;' % x - else: - return u'&%s;' % x - - def __init__(self, parser, name, attrs = None, parent = None, - previous = None): - "Basic constructor." - - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected - self.parserClass = parser.__class__ - self.isSelfClosing = parser.isSelfClosingTag(name) - self.name = name - if attrs is None: - attrs = [] - elif isinstance(attrs, dict): - attrs = attrs.items() - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - self.containsSubstitutions = False - self.convertHTMLEntities = parser.convertHTMLEntities - self.convertXMLEntities = parser.convertXMLEntities - self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities - - # Convert any HTML, XML, or numeric entities in the attribute values. - convert = lambda(k, val): (k, - re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", - self._convertEntities, - val)) - self.attrs = map(convert, self.attrs) - - def getString(self): - if (len(self.contents) == 1 - and isinstance(self.contents[0], NavigableString)): - return self.contents[0] - - def setString(self, string): - """Replace the contents of the tag with a string""" - self.clear() - self.append(string) - - string = property(getString, setString) - - def getText(self, separator = u""): - if not len(self.contents): - return u"" - stopNode = self._lastRecursiveChild().next - strings = [] - current = self.contents[0] - while current is not stopNode: - if isinstance(current, NavigableString): - strings.append(current.strip()) - current = current.next - return separator.join(strings) - - text = property(getText) - - def get(self, key, default = None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self._getAttrMap().get(key, default) - - def clear(self): - """Extract all children.""" - for child in self.contents[:]: - child.extract() - - def index(self, element): - for i, child in enumerate(self.contents): - if child is element: - return i - raise ValueError("Tag.index: element not in tag") - - def has_key(self, key): - return self._getAttrMap().has_key(key) - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" - return self._getAttrMap()[key] - - def __iter__(self): - "Iterating over a tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __nonzero__(self): - "A tag is non-None even if it has no contents." 
- return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] - - def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - findAll() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return apply(self.findAll, args, kwargs) - - def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) - if len(tag) > 3 and tag.rfind('Tag') == len(tag) - 3: - return self.find(tag[:-3]) - elif tag.find('__') != 0: - return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) - - def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - NOTE: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" - if other is self: - return True - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): - return False - for i in range(0, len(self.contents)): - if self.contents[i] != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding = DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" - return self.__str__(encoding) - - def __unicode__(self): - return self.__str__(None) - - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - + ")") - - def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" - - def __str__(self, encoding = DEFAULT_OUTPUT_ENCODING, - prettyPrint = False, indentLevel = 0): - """Returns a string or Unicode representation of this tag and - its contents. To get Unicode, pass None for encoding. - - NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" - - encodedName = self.toEncoding(self.name, encoding) - - attrs = [] - if self.attrs: - for key, val in self.attrs: - fmt = '%s="%s"' - if isinstance(val, basestring): - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: - val = self.substituteEncoding(val, encoding) - - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. 
This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. - if '"' in val: - fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. - val = val.replace("'", "&squot;") - - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. We need - # to escape those to XML entities too. - val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) - - attrs.append(fmt % (self.toEncoding(key, encoding), - self.toEncoding(val, encoding))) - close = '' - closeTag = '' - if self.isSelfClosing: - close = ' /' - else: - closeTag = '' % encodedName - - indentTag, indentContents = 0, 0 - if prettyPrint: - indentTag = indentLevel - space = (' ' * (indentTag - 1)) - indentContents = indentTag + 1 - contents = self.renderContents(encoding, prettyPrint, indentContents) - if self.hidden: - s = contents - else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if prettyPrint: - s.append(space) - s.append('<%s%s%s>' % (encodedName, attributeString, close)) - if prettyPrint: - s.append("\n") - s.append(contents) - if prettyPrint and contents and contents[-1] != "\n": - s.append("\n") - if prettyPrint and closeTag: - s.append(space) - s.append(closeTag) - if prettyPrint and closeTag and self.nextSibling: - s.append("\n") - s = ''.join(s) - return s - - def decompose(self): - """Recursively destroys the contents of this tree.""" - self.extract() - if len(self.contents) == 0: - return - current = self.contents[0] - while current is not None: - next = current.next - if isinstance(current, Tag): - del current.contents[:] - current.parent = None - current.previous = None - current.previousSibling = None - current.next = None - current.nextSibling = None - current = next - - def prettify(self, encoding = DEFAULT_OUTPUT_ENCODING): - return self.__str__(encoding, True) - - def renderContents(self, encoding = DEFAULT_OUTPUT_ENCODING, - prettyPrint = False, indentLevel = 0): - """Renders the contents of this tag as a string in the given - encoding. If encoding is None, returns a Unicode string..""" - s = [] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.__str__(encoding) - elif isinstance(c, Tag): - s.append(c.__str__(encoding, prettyPrint, indentLevel)) - if text and prettyPrint: - text = text.strip() - if text: - if prettyPrint: - s.append(" " * (indentLevel - 1)) - s.append(text) - if prettyPrint: - s.append("\n") - return ''.join(s) - - #Soup methods - - def find(self, name = None, attrs = {}, recursive = True, text = None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r - findChild = find - - def findAll(self, name = None, attrs = {}, recursive = True, text = None, - limit = None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. 
- - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - generator = self.recursiveChildGenerator - if not recursive: - generator = self.childGenerator - return self._findAll(name, attrs, text, limit, generator, **kwargs) - findChildren = findAll - - # Pre-3.x compatibility methods - first = find - fetch = findAll - - def fetchText(self, text = None, recursive = True, limit = None): - return self.findAll(text = text, recursive = recursive, limit = limit) - - def firstText(self, text = None, recursive = True): - return self.find(text = text, recursive = recursive) - - #Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - - #Generator methods - def childGenerator(self): - # Just use the iterator from the contents - return iter(self.contents) - - def recursiveChildGenerator(self): - if not len(self.contents): - raise StopIteration - stopNode = self._lastRecursiveChild().next - current = self.contents[0] - while current is not stopNode: - yield current - current = current.next - - -# Next, a couple classes to represent queries and their results. -class SoupStrainer: - """Encapsulates a number of ways of matching a markup element (tag or - text).""" - - def __init__(self, name = None, attrs = {}, text = None, **kwargs): - self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = _match_css_class(attrs) - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) - else: - attrs = kwargs - self.attrs = attrs - self.text = text - - def __str__(self): - if self.text: - return self.text - else: - return "%s|%s" % (self.name, self.attrs) - - def searchTag(self, markupName = None, markupAttrs = {}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) - - if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) - else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs - else: - markupAttrMap = {} - for k, v in markupAttrs: - markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False - break - if match: - if markup: - found = markup - else: - found = markupName - return found - - def search(self, markup): - #print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. - if hasattr(markup, "__iter__") \ - and not isinstance(markup, Tag): - for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): - found = element - break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. 
- elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. - elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): - if self._matches(markup, self.text): - found = markup - else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ - return found - - def _matches(self, markup, matchAgainst): - #print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst is True: - result = markup is not None - elif callable(matchAgainst): - result = matchAgainst(markup) - else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup and not isinstance(markup, basestring): - markup = unicode(markup) - #Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. - result = markup and matchAgainst.search(markup) - elif hasattr(matchAgainst, '__iter__'): # list-like - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) - else: - matchAgainst = str(matchAgainst) - - if not result: - result = matchAgainst == markup - return result - -class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" - def __init__(self, source): - list.__init__([]) - self.source = source - -# Now, some helper functions. - -def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and - NESTING_RESET_TAGS maps out of lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - #It's a map. Merge it. - for k, v in portion.items(): - built[k] = v - elif hasattr(portion, '__iter__'): # is a list - #It's a list. Map each item to the default. - for k in portion: - built[k] = default - else: - #It's a scalar. Map it to the default. - built[portion] = default - return built - -# Now, the parser classes. - -class BeautifulStoneSoup(Tag, SGMLParser): - - """This class contains the basic parser and search code. It defines - a parser that knows nothing about tag behavior except for the - following: - - You can't close a tag without closing all the tags it encloses. - That is, "" actually means - "". - - [Another possible explanation is "", but since - this class defines no SELF_CLOSING_TAGS, it will never use that - explanation.] - - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting.""" - - SELF_CLOSING_TAGS = {} - NESTABLE_TAGS = {} - RESET_NESTING_TAGS = {} - QUOTE_TAGS = {} - PRESERVE_WHITESPACE_TAGS = [] - - MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), - lambda x: x.group(1) + ' />'), - (re.compile(']*)>'), - lambda x: '') - ] - - ROOT_TAG_NAME = u'[document]' - - HTML_ENTITIES = "html" - XML_ENTITIES = "xml" - XHTML_ENTITIES = "xhtml" - # TODO: This only exists for backwards-compatibility - ALL_ENTITIES = XHTML_ENTITIES - - # Used when determining whether a text node is all whitespace and - # can be replaced with a single space. A text node that contains - # fancy Unicode spaces (usually non-breaking) should be left - # alone. 
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } - - def __init__(self, markup = "", parseOnlyThese = None, fromEncoding = None, - markupMassage = True, smartQuotesTo = XML_ENTITIES, - convertEntities = None, selfClosingTags = None, isHTML = False): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser. - - sgmllib will process most bad HTML, and the BeautifulSoup - class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data - if your data uses self-closing tags or declarations - incorrectly. - - By default, Beautiful Soup uses regexes to sanitize input, - avoiding the vast majority of these problems. If the problems - don't apply to you, pass in False for markupMassage, and - you'll get better performance. - - The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: - -
(No space between name of closing tag and tag close) - (Extraneous whitespace in declaration) - - You can pass in a custom list of (RE object, replace method) - tuples to get Beautiful Soup to scrub your input the way you - want.""" - - self.parseOnlyThese = parseOnlyThese - self.fromEncoding = fromEncoding - self.smartQuotesTo = smartQuotesTo - self.convertEntities = convertEntities - # Set the rules for how we'll deal with the entities we - # encounter - if self.convertEntities: - # It doesn't make sense to convert encoded characters to - # entities even while you're converting entities to Unicode. - # Just convert it all to Unicode. - self.smartQuotesTo = None - if convertEntities == self.HTML_ENTITIES: - self.convertXMLEntities = False - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = True - elif convertEntities == self.XHTML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = False - elif convertEntities == self.XML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - else: - self.convertXMLEntities = False - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - - self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) - SGMLParser.__init__(self) - - if hasattr(markup, 'read'): # It's a file-type object. - markup = markup.read() - self.markup = markup - self.markupMassage = markupMassage - try: - self._feed(isHTML = isHTML) - except StopParsing: - pass - self.markup = None # The markup can now be GCed - - def convert_charref(self, name): - """This method fixes a bug in Python's SGMLParser.""" - try: - n = int(name) - except ValueError: - return - if not 0 <= n <= 127 : # ASCII ends at 127, not 255 - return - return self.convert_codepoint(n) - - def _feed(self, inDocumentEncoding = None, isHTML = False): - # Convert the document to Unicode. - markup = self.markup - if isinstance(markup, unicode): - if not hasattr(self, 'originalEncoding'): - self.originalEncoding = None - else: - dammit = UnicodeDammit\ - (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo = self.smartQuotesTo, isHTML = isHTML) - markup = dammit.unicode - self.originalEncoding = dammit.originalEncoding - self.declaredHTMLEncoding = dammit.declaredHTMLEncoding - if markup: - if self.markupMassage: - if not hasattr(self.markupMassage, "__iter__"): - self.markupMassage = self.MARKUP_MASSAGE - for fix, m in self.markupMassage: - markup = fix.sub(m, markup) - # TODO: We get rid of markupMassage so that the - # soup object can be deepcopied later on. Some - # Python installations can't copy regexes. If anyone - # was relying on the existence of markupMassage, this - # might cause problems. - del(self.markupMassage) - self.reset() - - SGMLParser.feed(self, markup) - # Close out any unfinished strings and close all the open tags. 
- self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - #print "__getattr__ called on %s.%s" % (self.__class__, methodName) - - if methodName.startswith('start_') or methodName.startswith('end_') \ - or methodName.startswith('do_'): - return SGMLParser.__getattr__(self, methodName) - elif not methodName.startswith('__'): - return Tag.__getattr__(self, methodName) - else: - raise AttributeError - - def isSelfClosingTag(self, name): - """Returns true iff the given string is the name of a - self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) - - def reset(self): - Tag.__init__(self, self, self.ROOT_TAG_NAME) - self.hidden = 1 - SGMLParser.reset(self) - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.quoteStack = [] - self.pushTag(self) - - def popTag(self): - tag = self.tagStack.pop() - - #print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - #print "Push", tag.name - if self.currentTag: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - - def endData(self, containerClass = NavigableString): - if self.currentData: - currentData = u''.join(self.currentData) - if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and - not set([tag.name for tag in self.tagStack]).intersection( - self.PRESERVE_WHITESPACE_TAGS)): - if '\n' in currentData: - currentData = '\n' - else: - currentData = ' ' - self.currentData = [] - if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): - return - o = containerClass(currentData) - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - - - def _popToTag(self, name, inclusivePop = True): - """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - #print "Popping to %s" % name - if name == self.ROOT_TAG_NAME: - return - - numPops = 0 - mostRecentTag = None - for i in range(len(self.tagStack) - 1, 0, -1): - if name == self.tagStack[i].name: - numPops = len(self.tagStack) - i - break - if not inclusivePop: - numPops = numPops - 1 - - for i in range(0, numPops): - mostRecentTag = self.popTag() - return mostRecentTag - - def _smartPop(self, name): - - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: -

<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
-         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
-         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
-
-         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
-         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
-         <td><tr><td>
    ** should pop to 'tr', not the first 'td' - """ - - nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) - popTo = None - inclusive = True - for i in range(len(self.tagStack) - 1, 0, -1): - p = self.tagStack[i] - if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. - popTo = name - break - if (nestingResetTriggers is not None - and p.name in nestingResetTriggers) \ - or (nestingResetTriggers is None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): - - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. - popTo = p.name - inclusive = False - break - p = p.parent - if popTo: - self._popToTag(popTo, inclusive) - - def unknown_starttag(self, name, attrs, selfClosing = 0): - #print "Start tag %s: %s" % (name, attrs) - if self.quoteStack: - #This is not a real tag. - #print "<%s> is not real!" % name - attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) - self.handle_data('<%s%s>' % (name, attrs)) - return - self.endData() - - if not self.isSelfClosingTag(name) and not selfClosing: - self._smartPop(name) - - if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): - return - - tag = Tag(self, name, attrs, self.currentTag, self.previous) - if self.previous: - self.previous.next = tag - self.previous = tag - self.pushTag(tag) - if selfClosing or self.isSelfClosingTag(name): - self.popTag() - if name in self.QUOTE_TAGS: - #print "Beginning quote (%s)" % name - self.quoteStack.append(name) - self.literal = 1 - return tag - - def unknown_endtag(self, name): - #print "End tag %s" % name - if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print " is not real!" % name - self.handle_data('' % name) - return - self.endData() - self._popToTag(name) - if self.quoteStack and self.quoteStack[-1] == name: - self.quoteStack.pop() - self.literal = (len(self.quoteStack) > 0) - - def handle_data(self, data): - self.currentData.append(data) - - def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.endData() - self.handle_data(text) - self.endData(subclass) - - def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" - if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" - self._toStringSubclass(text, ProcessingInstruction) - - def handle_comment(self, text): - "Handle comments as Comment objects." - self._toStringSubclass(text, Comment) - - def handle_charref(self, ref): - "Handle character references as data." 
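# ---- Editor's sketch (not part of the patch): the smart-popping rules
# ---- documented in _smartPop above, via the module this patch deletes.
from BeautifulSoup import BeautifulSoup

# The second <p> implicitly closes the first instead of nesting inside it:
soup = BeautifulSoup('<p>Para1<p>Para2')
print soup  # roughly: <p>Para1</p><p>Para2</p>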
- if self.convertEntities: - data = unichr(int(ref)) - else: - data = '&#%s;' % ref - self.handle_data(data) - - def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML and/or XML entity references to the corresponding Unicode - characters.""" - data = None - if self.convertHTMLEntities: - try: - data = unichr(name2codepoint[ref]) - except KeyError: - pass - - if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) - - if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref - if not data: - # This case is different from the one above, because we - # haven't already gone through a supposedly comprehensive - # mapping of entities to Unicode characters. We might not - # have gone through any mapping at all. So the chances are - # very high that this is a real entity, and not a - # misplaced ampersand. - data = "&%s;" % ref - self.handle_data(data) - - def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." - self._toStringSubclass(data, Declaration) - - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" - j = None - if self.rawdata[i:i + 9] == '', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i + 9:k] - j = k + 3 - self._toStringSubclass(data, CData) - else: - try: - j = SGMLParser.parse_declaration(self, i) - except SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - -class BeautifulSoup(BeautifulStoneSoup): - - """This parser knows the following facts about HTML: - - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. - - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. - - * Tag nesting rules: - - Most tags can't be nested at all. For instance, the occurance of - a

<p> tag should implicitly close the previous <p> tag.
-
-       <p>Para1<p>Para2
-        should be transformed into:
-       <p>Para1</p><p>Para2
-
-      Some tags can be nested arbitrarily. For instance, the occurance
-      of a <blockquote> tag should _not_ implicitly close the previous
-      <blockquote> tag.
-
-       Alice said: <blockquote>Bob said: <blockquote>Blah
-        should NOT be transformed into:
-       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
-
-      Some tags can be nested, but the nesting is reset by the
-      interposition of other tags. For instance, a <tr> tag should
-      implicitly close the previous <tr> tag within the same <table>,
-      but not close a <tr> tag in another table.
-
-       <table><tr>Blah<tr>Blah
-        should be transformed into:
-       <table><tr>Blah</tr><tr>Blah
-       but,
-       <tr>Blah<table><tr>Blah
-        should NOT be transformed into
-       <tr>Blah<table></tr><tr>
    Blah - - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try ICantBelieveItsBeautifulSoup, MinimalSoup, or - BeautifulStoneSoup before writing your own subclass.""" - - def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): - kwargs['smartQuotesTo'] = self.HTML_ENTITIES - kwargs['isHTML'] = True - BeautifulStoneSoup.__init__(self, *args, **kwargs) - - SELF_CLOSING_TAGS = buildTagMap(None, - ('br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base', 'col')) - - PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) - - QUOTE_TAGS = {'script' : None, 'textarea' : None} - - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', - 'center') - - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') - - #Lists can contain other lists, but there are restrictions. - NESTABLE_LIST_TAGS = { 'ol' : [], - 'ul' : [], - 'li' : ['ul', 'ol'], - 'dl' : [], - 'dd' : ['dl'], - 'dt' : ['dl'] } - - #Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table' : [], - 'tr' : ['table', 'tbody', 'tfoot', 'thead'], - 'td' : ['tr'], - 'th' : ['tr'], - 'thead' : ['table'], - 'tbody' : ['table'], - 'tfoot' : ['table'], - } - - NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') - - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. - RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', - NON_NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, - NESTABLE_TABLE_TAGS) - - NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) - - # Used to detect the charset in a META tag; see start_meta - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - - def start_meta(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning.""" - httpEquiv = None - contentType = None - contentTypeIndex = None - tagNeedsEncodingSubstitution = False - - for i in range(0, len(attrs)): - key, value = attrs[i] - key = key.lower() - if key == 'http-equiv': - httpEquiv = value - elif key == 'content': - contentType = value - contentTypeIndex = i - - if httpEquiv and contentType: # It's an interesting meta tag. - match = self.CHARSET_RE.search(contentType) - if match: - if (self.declaredHTMLEncoding is not None or - self.originalEncoding == self.fromEncoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. - def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - newAttr = self.CHARSET_RE.sub(rewrite, contentType) - attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], - newAttr) - tagNeedsEncodingSubstitution = True - else: - # This is our first pass through the document. 
- # Go through it again with the encoding information. - newCharset = match.group(3) - if newCharset and newCharset != self.originalEncoding: - self.declaredHTMLEncoding = newCharset - self._feed(self.declaredHTMLEncoding) - raise StopParsing - pass - tag = self.unknown_starttag("meta", attrs) - if tag and tagNeedsEncodingSubstitution: - tag.containsSubstitutions = True - -class StopParsing(Exception): - pass - -class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: - - FooBar - - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "FooBar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" - - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big') - - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) - - NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) - -class MinimalSoup(BeautifulSoup): - """The MinimalSoup class is for parsing HTML that contains - pathologically bad markup. 
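# ---- Editor's sketch (not part of the patch): the nested-<b> case
# ---- described above, via the classes this patch deletes.
from BeautifulSoup import BeautifulSoup, ICantBelieveItsBeautifulSoup

markup = '<b>Foo<b>Bar</b></b>'
# BeautifulSoup assumes the author forgot to close the first <b>:
print BeautifulSoup(markup)                 # roughly: <b>Foo</b><b>Bar</b>
# ICantBelieveItsBeautifulSoup takes the nesting at face value:
print ICantBelieveItsBeautifulSoup(markup)  # roughly: <b>Foo<b>Bar</b></b>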
It makes no assumptions about tag - nesting, but it does know which tags are self-closing, that - ') + # => <script> do_nasty_stuff() </script> + # sanitize_html('Click here for $100') + # => Click here for $100 + def sanitize_token(self, token): + + # accommodate filters which use token_type differently + token_type = token["type"] + if token_type in tokenTypes.keys(): + token_type = tokenTypes[token_type] + + if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"]): + if token["name"] in self.allowed_elements: + if token.has_key("data"): + attrs = dict([(name,val) for name,val in + token["data"][::-1] + if name in self.allowed_attributes]) + for attr in self.attr_val_is_uri: + if not attrs.has_key(attr): + continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', + unescape(attrs[attr])).lower() + #remove replacement characters from unescaped characters + val_unescaped = val_unescaped.replace(u"\ufffd", "") + if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and + (val_unescaped.split(':')[0] not in + self.allowed_protocols)): + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: + if attr in attrs: + attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + if (token["name"] in self.svg_allow_local_href and + 'xlink:href' in attrs and re.search('^\s*[^#\s].*', + attrs['xlink:href'])): + del attrs['xlink:href'] + if attrs.has_key('style'): + attrs['style'] = self.sanitize_css(attrs['style']) + token["data"] = [[name,val] for name,val in attrs.items()] + return token + else: + if token_type == tokenTypes["EndTag"]: + token["data"] = "" % token["name"] + elif token["data"]: + attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]]) + token["data"] = "<%s%s>" % (token["name"],attrs) + else: + token["data"] = "<%s>" % token["name"] + if token.get("selfClosing"): + token["data"]=token["data"][:-1] + "/>" + + if token["type"] in tokenTypes.keys(): + token["type"] = "Characters" + else: + token["type"] = tokenTypes["Characters"] + + del token["name"] + return token + elif token_type == tokenTypes["Comment"]: + pass + else: + return token + + def sanitize_css(self, style): + # disallow urls + style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) + + # gauntlet + if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' + if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' + + clean = [] + for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): + if not value: continue + if prop.lower() in self.allowed_css_properties: + clean.append(prop + ': ' + value + ';') + elif prop.split('-')[0].lower() in ['background','border','margin', + 'padding']: + for keyword in value.split(): + if not keyword in self.acceptable_css_keywords and \ + not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword): + break + else: + clean.append(prop + ': ' + value + ';') + elif prop.lower() in self.allowed_svg_properties: + clean.append(prop + ': ' + value + ';') + + return ' '.join(clean) + +class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin): + def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, + lowercaseElementName=False, lowercaseAttrName=False, parser=None): + #Change case matching defaults as we only output lowercase html anyway + #This solution doesn't seem ideal... 
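# ---- Editor's sketch (not part of the patch): using the sanitizing
# ---- tokenizer defined above at parse time, assuming the vendored
# ---- libs/ directory is on sys.path.
import html5lib
from html5lib import sanitizer

parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
# <script> is not an allowed element, so its tags are escaped to text
# instead of being parsed as markup:
fragment = parser.parseFragment('<script>do_nasty_stuff()</script>')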
+ HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, + lowercaseElementName, lowercaseAttrName, parser=parser) + + def __iter__(self): + for token in HTMLTokenizer.__iter__(self): + token = self.sanitize_token(token) + if token: + yield token diff --git a/libs/html5lib/serializer/__init__.py b/libs/html5lib/serializer/__init__.py new file mode 100644 index 0000000..1b74665 --- /dev/null +++ b/libs/html5lib/serializer/__init__.py @@ -0,0 +1,17 @@ + +from html5lib import treewalkers + +from htmlserializer import HTMLSerializer +from xhtmlserializer import XHTMLSerializer + +def serialize(input, tree="simpletree", format="html", encoding=None, + **serializer_opts): + # XXX: Should we cache this? + walker = treewalkers.getTreeWalker(tree) + if format == "html": + s = HTMLSerializer(**serializer_opts) + elif format == "xhtml": + s = XHTMLSerializer(**serializer_opts) + else: + raise ValueError, "type must be either html or xhtml" + return s.render(walker(input), encoding) diff --git a/libs/html5lib/serializer/htmlserializer.py b/libs/html5lib/serializer/htmlserializer.py new file mode 100644 index 0000000..8dd0a81 --- /dev/null +++ b/libs/html5lib/serializer/htmlserializer.py @@ -0,0 +1,312 @@ +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import ImmutableSet as frozenset + +import gettext +_ = gettext.gettext + +from html5lib.constants import voidElements, booleanAttributes, spaceCharacters +from html5lib.constants import rcdataElements, entities, xmlEntities +from html5lib import utils +from xml.sax.saxutils import escape + +spaceCharacters = u"".join(spaceCharacters) + +try: + from codecs import register_error, xmlcharrefreplace_errors +except ImportError: + unicode_encode_errors = "strict" +else: + unicode_encode_errors = "htmlentityreplace" + + from html5lib.constants import entities + + encode_entity_map = {} + is_ucs4 = len(u"\U0010FFFF") == 1 + for k, v in entities.items(): + #skip multi-character entities + if ((is_ucs4 and len(v) > 1) or + (not is_ucs4 and len(v) > 2)): + continue + if v != "&": + if len(v) == 2: + v = utils.surrogatePairToCodepoint(v) + else: + try: + v = ord(v) + except: + print v + raise + if not v in encode_entity_map or k.islower(): + # prefer < over < and similarly for &, >, etc. 
+ encode_entity_map[v] = k + + def htmlentityreplace_errors(exc): + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): + res = [] + codepoints = [] + skip = False + for i, c in enumerate(exc.object[exc.start:exc.end]): + if skip: + skip = False + continue + index = i + exc.start + if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]): + codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2]) + skip = True + else: + codepoint = ord(c) + codepoints.append(codepoint) + for cp in codepoints: + e = encode_entity_map.get(cp) + if e: + res.append("&") + res.append(e) + if not e.endswith(";"): + res.append(";") + else: + res.append("&#x%s;"%(hex(cp)[2:])) + return (u"".join(res), exc.end) + else: + return xmlcharrefreplace_errors(exc) + + register_error(unicode_encode_errors, htmlentityreplace_errors) + + del register_error + + +class HTMLSerializer(object): + + # attribute quoting options + quote_attr_values = False + quote_char = u'"' + use_best_quote_char = True + + # tag syntax options + omit_optional_tags = True + minimize_boolean_attributes = True + use_trailing_solidus = False + space_before_trailing_solidus = True + + # escaping options + escape_lt_in_attrs = False + escape_rcdata = False + resolve_entities = True + + # miscellaneous options + inject_meta_charset = True + strip_whitespace = False + sanitize = False + + options = ("quote_attr_values", "quote_char", "use_best_quote_char", + "minimize_boolean_attributes", "use_trailing_solidus", + "space_before_trailing_solidus", "omit_optional_tags", + "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", + "escape_rcdata", "resolve_entities", "sanitize") + + def __init__(self, **kwargs): + """Initialize HTMLSerializer. + + Keyword options (default given first unless specified) include: + + inject_meta_charset=True|False + Whether it insert a meta element to define the character set of the + document. + quote_attr_values=True|False + Whether to quote attribute values that don't require quoting + per HTML5 parsing rules. + quote_char=u'"'|u"'" + Use given quote character for attribute quoting. Default is to + use double quote unless attribute value contains a double quote, + in which case single quotes are used instead. + escape_lt_in_attrs=False|True + Whether to escape < in attribute values. + escape_rcdata=False|True + Whether to escape characters that need to be escaped within normal + elements within rcdata elements such as style. + resolve_entities=True|False + Whether to resolve named character entities that appear in the + source tree. The XML predefined entities < > & " ' + are unaffected by this setting. + strip_whitespace=False|True + Whether to remove semantically meaningless whitespace. (This + compresses all whitespace to a single space except within pre.) + minimize_boolean_attributes=True|False + Shortens boolean attributes to give just the attribute value, + for example becomes . + use_trailing_solidus=False|True + Includes a close-tag slash at the end of the start tag of void + elements (empty elements whose end tag is forbidden). E.g.
<hr/>. + space_before_trailing_solidus=True|False + Places a space immediately before the closing slash in a tag + using a trailing solidus. E.g. <hr />
    . Requires use_trailing_solidus. + sanitize=False|True + Strip all unsafe or unknown constructs from output. + See `html5lib user documentation`_ + omit_optional_tags=True|False + Omit start/end tags that are optional. + + .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation + """ + if kwargs.has_key('quote_char'): + self.use_best_quote_char = False + for attr in self.options: + setattr(self, attr, kwargs.get(attr, getattr(self, attr))) + self.errors = [] + self.strict = False + + def encode(self, string): + assert(isinstance(string, unicode)) + if self.encoding: + return string.encode(self.encoding, unicode_encode_errors) + else: + return string + + def encodeStrict(self, string): + assert(isinstance(string, unicode)) + if self.encoding: + return string.encode(self.encoding, "strict") + else: + return string + + def serialize(self, treewalker, encoding=None): + self.encoding = encoding + in_cdata = False + self.errors = [] + if encoding and self.inject_meta_charset: + from html5lib.filters.inject_meta_charset import Filter + treewalker = Filter(treewalker, encoding) + # XXX: WhitespaceFilter should be used before OptionalTagFilter + # for maximum efficiently of this latter filter + if self.strip_whitespace: + from html5lib.filters.whitespace import Filter + treewalker = Filter(treewalker) + if self.sanitize: + from html5lib.filters.sanitizer import Filter + treewalker = Filter(treewalker) + if self.omit_optional_tags: + from html5lib.filters.optionaltags import Filter + treewalker = Filter(treewalker) + for token in treewalker: + type = token["type"] + if type == "Doctype": + doctype = u"= 0: + if token["systemId"].find(u"'") >= 0: + self.serializeError(_("System identifer contains both single and double quote characters")) + quote_char = u"'" + else: + quote_char = u'"' + doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char) + + doctype += u">" + yield self.encodeStrict(doctype) + + elif type in ("Characters", "SpaceCharacters"): + if type == "SpaceCharacters" or in_cdata: + if in_cdata and token["data"].find("= 0: + self.serializeError(_("Unexpected \"'=", False) + v = v.replace(u"&", u"&") + if self.escape_lt_in_attrs: v = v.replace(u"<", u"<") + if quote_attr: + quote_char = self.quote_char + if self.use_best_quote_char: + if u"'" in v and u'"' not in v: + quote_char = u'"' + elif u'"' in v and u"'" not in v: + quote_char = u"'" + if quote_char == u"'": + v = v.replace(u"'", u"'") + else: + v = v.replace(u'"', u""") + yield self.encodeStrict(quote_char) + yield self.encode(v) + yield self.encodeStrict(quote_char) + else: + yield self.encode(v) + if name in voidElements and self.use_trailing_solidus: + if self.space_before_trailing_solidus: + yield self.encodeStrict(u" /") + else: + yield self.encodeStrict(u"/") + yield self.encode(u">") + + elif type == "EndTag": + name = token["name"] + if name in rcdataElements: + in_cdata = False + elif in_cdata: + self.serializeError(_("Unexpected child element of a CDATA element")) + yield self.encodeStrict(u"" % name) + + elif type == "Comment": + data = token["data"] + if data.find("--") >= 0: + self.serializeError(_("Comment contains --")) + yield self.encodeStrict(u"" % token["data"]) + + elif type == "Entity": + name = token["name"] + key = name + ";" + if not key in entities: + self.serializeError(_("Entity %s not recognized" % name)) + if self.resolve_entities and key not in xmlEntities: + data = entities[key] + else: + data = u"&%s;" % name + yield self.encodeStrict(data) + + else: + 
self.serializeError(token["data"]) + + def render(self, treewalker, encoding=None): + if encoding: + return "".join(list(self.serialize(treewalker, encoding))) + else: + return u"".join(list(self.serialize(treewalker))) + + def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): + # XXX The idea is to make data mandatory. + self.errors.append(data) + if self.strict: + raise SerializeError + +def SerializeError(Exception): + """Error in serialized tree""" + pass diff --git a/libs/html5lib/serializer/xhtmlserializer.py b/libs/html5lib/serializer/xhtmlserializer.py new file mode 100644 index 0000000..7fdce47 --- /dev/null +++ b/libs/html5lib/serializer/xhtmlserializer.py @@ -0,0 +1,9 @@ +from htmlserializer import HTMLSerializer + +class XHTMLSerializer(HTMLSerializer): + quote_attr_values = True + minimize_boolean_attributes = False + use_trailing_solidus = True + escape_lt_in_attrs = True + omit_optional_tags = False + escape_rcdata = True diff --git a/libs/html5lib/tokenizer.py b/libs/html5lib/tokenizer.py new file mode 100644 index 0000000..7e9eca8 --- /dev/null +++ b/libs/html5lib/tokenizer.py @@ -0,0 +1,1744 @@ +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import Set as set + from sets import ImmutableSet as frozenset +try: + from collections import deque +except ImportError: + from utils import deque + +from constants import spaceCharacters +from constants import entitiesWindows1252, entities +from constants import asciiLowercase, asciiLetters, asciiUpper2Lower +from constants import digits, hexDigits, EOF +from constants import tokenTypes, tagTokenTypes +from constants import replacementCharacters + +from inputstream import HTMLInputStream + +# Group entities by their first character, for faster lookups +entitiesByFirstChar = {} +for e in entities: + entitiesByFirstChar.setdefault(e[0], []).append(e) + +class HTMLTokenizer(object): + """ This class takes care of tokenizing HTML. + + * self.currentToken + Holds the token that is currently being processed. + + * self.state + Holds a reference to the method to be invoked... XXX + + * self.stream + Points to HTMLInputStream object. + """ + + def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, + lowercaseElementName=True, lowercaseAttrName=True, parser=None): + + self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) + self.parser = parser + + #Perform case conversions? + self.lowercaseElementName = lowercaseElementName + self.lowercaseAttrName = lowercaseAttrName + + # Setup the initial tokenizer state + self.escapeFlag = False + self.lastFourChars = [] + self.state = self.dataState + self.escape = False + + # The current token being created + self.currentToken = None + super(HTMLTokenizer, self).__init__() + + def __iter__(self): + """ This is where the magic happens. + + We do our usually processing through the states and when we have a token + to return we yield the token which pauses processing until the next token + is requested. + """ + self.tokenQueue = deque([]) + # Start processing. When EOF is reached self.state will return False + # instead of True and the loop will terminate. + while self.state(): + while self.stream.errors: + yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} + while self.tokenQueue: + yield self.tokenQueue.popleft() + + def consumeNumberEntity(self, isHex): + """This function returns either U+FFFD or the character based on the + decimal or hexadecimal representation. 
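# ---- Editor's sketch (not part of the patch): what consumeNumberEntity
# ---- below produces for a few numeric character references.
from html5lib.tokenizer import HTMLTokenizer

for token in HTMLTokenizer('&#233; &#xE9; &#0;'):
    print token
# Both &#233; and &#xE9; decode to u'\xe9'; &#0; is in the replacement
# table, so it yields U+FFFD plus an
# "illegal-codepoint-for-numeric-entity" ParseError token.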
It also discards ";" if present. + If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. + """ + + allowed = digits + radix = 10 + if isHex: + allowed = hexDigits + radix = 16 + + charStack = [] + + # Consume all the characters that are in range while making sure we + # don't hit an EOF. + c = self.stream.char() + while c in allowed and c is not EOF: + charStack.append(c) + c = self.stream.char() + + # Convert the set of characters consumed to an int. + charAsInt = int("".join(charStack), radix) + + # Certain characters get replaced with others + if charAsInt in replacementCharacters: + char = replacementCharacters[charAsInt] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + elif ((0xD800 <= charAsInt <= 0xDFFF) or + (charAsInt > 0x10FFFF)): + char = u"\uFFFD" + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + else: + #Should speed up this check somehow (e.g. move the set to a constant) + if ((0x0001 <= charAsInt <= 0x0008) or + (0x000E <= charAsInt <= 0x001F) or + (0x007F <= charAsInt <= 0x009F) or + (0xFDD0 <= charAsInt <= 0xFDEF) or + charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, + 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, + 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, + 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, + 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, + 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, + 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, + 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, + 0xFFFFF, 0x10FFFE, 0x10FFFF])): + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + try: + # Try/except needed as UCS-2 Python builds' unichar only works + # within the BMP. + char = unichr(charAsInt) + except ValueError: + char = eval("u'\\U%08x'" % charAsInt) + + # Discard the ; if present. Otherwise, put it back on the queue and + # invoke parseError on parser. + if c != u";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "numeric-entity-without-semicolon"}) + self.stream.unget(c) + + return char + + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # Initialise to the default output for when no entity is matched + output = u"&" + + charStack = [self.stream.char()] + if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") + or (allowedChar is not None and allowedChar == charStack[0])): + self.stream.unget(charStack[0]) + + elif charStack[0] == u"#": + # Read the next character to see if it's hex or decimal + hex = False + charStack.append(self.stream.char()) + if charStack[-1] in (u"x", u"X"): + hex = True + charStack.append(self.stream.char()) + + # charStack[-1] should be the first digit + if (hex and charStack[-1] in hexDigits) \ + or (not hex and charStack[-1] in digits): + # At least one digit found, so consume the whole number + self.stream.unget(charStack[-1]) + output = self.consumeNumberEntity(hex) + else: + # No digits found + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "expected-numeric-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + + else: + # At this point in the process might have named entity. Entities + # are stored in the global variable "entities". 
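# ---- Editor's sketch (not part of the patch): a standalone model of the
# ---- longest-match loop below, using the same `entities` table.
from html5lib.constants import entities

def longest_entity(text):
    # Try progressively shorter prefixes until one names a known entity.
    for length in xrange(len(text), 1, -1):
        if text[:length] in entities:
            return text[:length]
    return None

print longest_entity('notin;')  # 'notin;' -- a full match
print longest_entity('notit;')  # 'not'    -- '&notit;' becomes '&not' + 'it;'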
+ # + # Consume characters and compare to these to a substring of the + # entity names in the list until the substring no longer matches. + filteredEntityList = entitiesByFirstChar.get(charStack[0], []) + + def entitiesStartingWith(name): + return [e for e in filteredEntityList if e.startswith(name)] + + while (charStack[-1] is not EOF and + entitiesStartingWith("".join(charStack))): + charStack.append(self.stream.char()) + + # At this point we have a string that starts with some characters + # that may match an entity + entityName = None + + # Try to find the longest entity the string will match to take care + # of ¬i for instance. + for entityLength in xrange(len(charStack)-1, 1, -1): + possibleEntityName = "".join(charStack[:entityLength]) + if possibleEntityName in entities: + entityName = possibleEntityName + break + + if entityName is not None: + if entityName[-1] != ";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "named-entity-without-semicolon"}) + if (entityName[-1] != ";" and fromAttribute and + (charStack[entityLength] in asciiLetters or + charStack[entityLength] in digits or + charStack[entityLength] == "=")): + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + else: + output = entities[entityName] + self.stream.unget(charStack.pop()) + output += u"".join(charStack[entityLength:]) + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-named-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + + if fromAttribute: + self.currentToken["data"][-1][1] += output + else: + if output in spaceCharacters: + tokenType = "SpaceCharacters" + else: + tokenType = "Characters" + self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) + + def processEntityInAttribute(self, allowedChar): + """This method replaces the need for "entityInAttributeValueState". + """ + self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) + + def emitCurrentToken(self): + """This method is a generic handler for emitting the tags. It also sets + the state to "data" because that's what's needed after a token has been + emitted. + """ + token = self.currentToken + # Add token to the queue to be yielded + if (token["type"] in tagTokenTypes): + if self.lowercaseElementName: + token["name"] = token["name"].translate(asciiUpper2Lower) + if token["type"] == tokenTypes["EndTag"]: + if token["data"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"attributes-in-end-tag"}) + if token["selfClosing"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"self-closing-flag-on-end-tag"}) + self.tokenQueue.append(token) + self.state = self.dataState + + + # Below are the various tokenizer states worked out. + + def dataState(self): + data = self.stream.char() + if data == "&": + self.state = self.entityDataState + elif data == "<": + self.state = self.tagOpenState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data":"invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\u0000"}) + elif data is EOF: + # Tokenization ends. + return False + elif data in spaceCharacters: + # Directly after emitting a token you switch back to the "data + # state". At that point spaceCharacters are important so they are + # emitted separately. 
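# ---- Editor's sketch (not part of the patch): whitespace right after a
# ---- tag is emitted as its own SpaceCharacters token, as the comment
# ---- above explains.
from html5lib.tokenizer import HTMLTokenizer

for token in HTMLTokenizer('<p>  hi'):
    print token["type"]  # StartTag, then SpaceCharacters, then Characters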
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": + data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any sequences + else: + chars = self.stream.charsUntil((u"&", u"<", u"\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def entityDataState(self): + self.consumeEntity() + self.state = self.dataState + return True + + def rcdataState(self): + data = self.stream.char() + if data == "&": + self.state = self.characterReferenceInRcdata + elif data == "<": + self.state = self.rcdataLessThanSignState + elif data == EOF: + # Tokenization ends. + return False + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + elif data in spaceCharacters: + # Directly after emitting a token you switch back to the "data + # state". At that point spaceCharacters are important so they are + # emitted separately. + self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": + data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any sequences + else: + chars = self.stream.charsUntil((u"&", u"<")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def characterReferenceInRcdata(self): + self.consumeEntity() + self.state = self.rcdataState + return True + + def rawtextState(self): + data = self.stream.char() + if data == "<": + self.state = self.rawtextLessThanSignState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + elif data == EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil((u"<", u"\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def scriptDataState(self): + data = self.stream.char() + if data == "<": + self.state = self.scriptDataLessThanSignState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + elif data == EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil((u"<", u"\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def plaintextState(self): + data = self.stream.char() + if data == EOF: + # Tokenization ends. 
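# ---- Editor's sketch (not part of the patch): every state method returns
# ---- True to keep the __iter__ loop running; returning False on EOF, as
# ---- below, is what finally ends tokenization.
from html5lib.tokenizer import HTMLTokenizer

tokens = list(HTMLTokenizer('<p>hi'))  # the list ends once the state sees EOF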
+ return False + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + self.stream.charsUntil(u"\u0000")}) + return True + + def tagOpenState(self): + data = self.stream.char() + if data == u"!": + self.state = self.markupDeclarationOpenState + elif data == u"/": + self.state = self.closeTagOpenState + elif data in asciiLetters: + self.currentToken = {"type": tokenTypes["StartTag"], + "name": data, "data": [], + "selfClosing": False, + "selfClosingAcknowledged": False} + self.state = self.tagNameState + elif data == u">": + # XXX In theory it could be something besides a tag name. But + # do we really care? + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-right-bracket"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"}) + self.state = self.dataState + elif data == u"?": + # XXX In theory it could be something besides a tag name. But + # do we really care? + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-question-mark"}) + self.stream.unget(data) + self.state = self.bogusCommentState + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.dataState + return True + + def closeTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.currentToken = {"type": tokenTypes["EndTag"], "name": data, + "data": [], "selfClosing":False} + self.state = self.tagNameState + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-right-bracket"}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-eof"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"": + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-tag-name"}) + self.state = self.dataState + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["name"] += u"\uFFFD" + else: + self.currentToken["name"] += data + # (Don't use charsUntil here, because tag names are + # very short and it's faster to not do anything fancy) + return True + + def rcdataLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.rcdataEndTagOpenState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.rcdataState + return True + + def rcdataEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer += data + self.state = self.rcdataEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer 
+= data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"}) + self.state = self.scriptDataState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + self.state = self.scriptDataEscapedState + elif data == EOF: + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.scriptDataEscapedEndTagOpenState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<" + data}) + self.temporaryBuffer = data + self.state = self.scriptDataDoubleEscapeStartState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer = data + self.state = self.scriptDataEscapedEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u""))): + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + if self.temporaryBuffer.lower() == "script": + self.state = self.scriptDataDoubleEscapedState + else: + self.state = self.scriptDataEscapedState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.temporaryBuffer += data + else: + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataDoubleEscapedState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataDoubleEscapedDashState + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], 
"data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + return True + + def scriptDataDoubleEscapedDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataDoubleEscapedDashDashState + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + self.state = self.scriptDataDoubleEscapedState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapedDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"}) + self.state = self.scriptDataState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"\uFFFD"}) + self.state = self.scriptDataDoubleEscapedState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapedLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"/"}) + self.temporaryBuffer = "" + self.state = self.scriptDataDoubleEscapeEndState + else: + self.stream.unget(data) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapeEndState(self): + data = self.stream.char() + if data in (spaceCharacters | frozenset(("/", ">"))): + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + if self.temporaryBuffer.lower() == "script": + self.state = self.scriptDataEscapedState + else: + self.state = self.scriptDataDoubleEscapedState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.temporaryBuffer += data + else: + self.stream.unget(data) + self.state = self.scriptDataDoubleEscapedState + return True + + def beforeAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == u">": + self.emitCurrentToken() + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data in (u"'", u'"', u"=", u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + 
self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"].append([u"\uFFFD", ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-name-but-got-eof"}) + self.state = self.dataState + else: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + return True + + def attributeNameState(self): + data = self.stream.char() + leavingThisState = True + emitToken = False + if data == u"=": + self.state = self.beforeAttributeValueState + elif data in asciiLetters: + self.currentToken["data"][-1][0] += data +\ + self.stream.charsUntil(asciiLetters, True) + leavingThisState = False + elif data == u">": + # XXX If we emit here the attributes are converted to a dict + # without being checked and when the code below runs we error + # because data is a dict not a list + emitToken = True + elif data in spaceCharacters: + self.state = self.afterAttributeNameState + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][0] += u"\uFFFD" + leavingThisState = False + elif data in (u"'", u'"', u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"][-1][0] += data + leavingThisState = False + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "eof-in-attribute-name"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][0] += data + leavingThisState = False + + if leavingThisState: + # Attributes are not dropped at this stage. That happens when the + # start tag token is emitted so values can still be safely appended + # to attributes, but we do want to report the parse error in time. 
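# ---- Editor's sketch (not part of the patch): the duplicate-attribute
# ---- check below only queues a ParseError; per the HTML5 spec, only the
# ---- first occurrence of the attribute survives in the emitted tag.
import html5lib

doc = html5lib.parse('<p id="a" id="b">x')
# The tree keeps id="a"; a "duplicate-attribute" ParseError was recorded.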
+ if self.lowercaseAttrName: + self.currentToken["data"][-1][0] = ( + self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) + for name, value in self.currentToken["data"][:-1]: + if self.currentToken["data"][-1][0] == name: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "duplicate-attribute"}) + break + # XXX Fix for above XXX + if emitToken: + self.emitCurrentToken() + return True + + def afterAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == u"=": + self.state = self.beforeAttributeValueState + elif data == u">": + self.emitCurrentToken() + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"].append([u"\uFFFD", ""]) + self.state = self.attributeNameState + elif data in (u"'", u'"', u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-after-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-end-of-tag-but-got-eof"}) + self.state = self.dataState + else: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + return True + + def beforeAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == u"\"": + self.state = self.attributeValueDoubleQuotedState + elif data == u"&": + self.state = self.attributeValueUnQuotedState + self.stream.unget(data); + elif data == u"'": + self.state = self.attributeValueSingleQuotedState + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-right-bracket"}) + self.emitCurrentToken() + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += u"\uFFFD" + self.state = self.attributeValueUnQuotedState + elif data in (u"=", u"<", u"`"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "equals-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + self.state = self.attributeValueUnQuotedState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-eof"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data + self.state = self.attributeValueUnQuotedState + return True + + def attributeValueDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterAttributeValueState + elif data == u"&": + self.processEntityInAttribute(u'"') + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += u"\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-double-quote"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("\"", u"&")) + return True + + def attributeValueSingleQuotedState(self): + data = self.stream.char() + if 
data == "'": + self.state = self.afterAttributeValueState + elif data == u"&": + self.processEntityInAttribute(u"'") + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += u"\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-single-quote"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("'", u"&")) + return True + + def attributeValueUnQuotedState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == u"&": + self.processEntityInAttribute(">") + elif data == u">": + self.emitCurrentToken() + elif data in (u'"', u"'", u"=", u"<", u"`"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += u"\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-no-quotes"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data + self.stream.charsUntil( + frozenset((u"&", u">", u'"', u"'", u"=", u"<", u"`")) | spaceCharacters) + return True + + def afterAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == u">": + self.emitCurrentToken() + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-EOF-after-attribute-value"}) + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-attribute-value"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState + return True + + def selfClosingStartTagState(self): + data = self.stream.char() + if data == ">": + self.currentToken["selfClosing"] = True + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "unexpected-EOF-after-solidus-in-tag"}) + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-soldius-in-tag"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState + return True + + def bogusCommentState(self): + # Make a new comment token and give it as value all the characters + # until the first > or EOF (charsUntil checks for EOF automatically) + # and emit it. + data = self.stream.charsUntil(u">") + data = data.replace(u"\u0000", u"\uFFFD") + self.tokenQueue.append( + {"type": tokenTypes["Comment"], "data": data}) + + # Eat the character directly after the bogus comment which is either a + # ">" or an EOF. 
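# ---- Editor's sketch (not part of the patch): bogus-comment handling as
# ---- described above -- markup with no real tag name becomes a Comment.
from html5lib.tokenizer import HTMLTokenizer

tokens = list(HTMLTokenizer('<?php echo 1 ?>'))
# Yields an "expected-tag-name-but-got-question-mark" ParseError followed
# by a Comment token whose data is "?php echo 1 ?".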
+ self.stream.char() + self.state = self.dataState + return True + + def markupDeclarationOpenState(self): + charStack = [self.stream.char()] + if charStack[-1] == u"-": + charStack.append(self.stream.char()) + if charStack[-1] == u"-": + self.currentToken = {"type": tokenTypes["Comment"], "data": u""} + self.state = self.commentStartState + return True + elif charStack[-1] in (u'd', u'D'): + matched = True + for expected in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'), + (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')): + charStack.append(self.stream.char()) + if charStack[-1] not in expected: + matched = False + break + if matched: + self.currentToken = {"type": tokenTypes["Doctype"], + "name": u"", + "publicId": None, "systemId": None, + "correct": True} + self.state = self.doctypeState + return True + elif (charStack[-1] == "[" and + self.parser is not None and + self.parser.tree.openElements and + self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace): + matched = True + for expected in ["C", "D", "A", "T", "A", "["]: + charStack.append(self.stream.char()) + if charStack[-1] != expected: + matched = False + break + if matched: + self.state = self.cdataSectionState + return True + + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-dashes-or-doctype"}) + + while charStack: + self.stream.unget(charStack.pop()) + self.state = self.bogusCommentState + return True + + def commentStartState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentStartDashState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += u"\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + self.state = self.commentState + return True + + def commentStartDashState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentEndState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += u"-\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += "-" + data + self.state = self.commentState + return True + + + def commentState(self): + data = self.stream.char() + if data == u"-": + self.state = self.commentEndDashState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += u"\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + \ + self.stream.charsUntil((u"-", u"\u0000")) + return True + + def commentEndDashState(self): + data = self.stream.char() + if data 
== u"-": + self.state = self.commentEndState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += u"-\uFFFD" + self.state = self.commentState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += u"-" + data + self.state = self.commentState + return True + + def commentEndState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += u"--\uFFFD" + self.state = self.commentState + elif data == "!": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-bang-after-double-dash-in-comment"}) + self.state = self.commentEndBangState + elif data == u"-": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-dash-after-double-dash-in-comment"}) + self.currentToken["data"] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-double-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-comment"}) + self.currentToken["data"] += u"--" + data + self.state = self.commentState + return True + + def commentEndBangState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"-": + self.currentToken["data"] += "--!" + self.state = self.commentEndDashState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += u"--!\uFFFD" + self.state = self.commentState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-bang-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += u"--!" 
+ data + self.state = self.commentState + return True + + def doctypeState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "need-space-after-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypeNameState + return True + + def beforeDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-right-bracket"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["name"] = u"\uFFFD" + self.state = self.doctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["name"] = data + self.state = self.doctypeNameState + return True + + def doctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.state = self.afterDoctypeNameState + elif data == u">": + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["name"] += u"\uFFFD" + self.state = self.doctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype-name"}) + self.currentToken["correct"] = False + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["name"] += data + return True + + def afterDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.currentToken["correct"] = False + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + if data in (u"p", u"P"): + matched = True + for expected in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"), + (u"i", u"I"), (u"c", u"C")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.afterDoctypePublicKeywordState + return True + elif data in (u"s", u"S"): + matched = True + for expected in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"), + (u"e", u"E"), (u"m", u"M")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.afterDoctypeSystemKeywordState + return True + + # All the characters read before the current 
'data' will be + # [a-zA-Z], so they're garbage in the bogus doctype and can be + # discarded; only the latest character might be '>' or EOF + # and needs to be ungetted + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-space-or-right-bracket-in-doctype", "datavars": + {"data": data}}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + + return True + + def afterDoctypePublicKeywordState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypePublicIdentifierState + elif data in ("'", '"'): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypePublicIdentifierState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.stream.unget(data) + self.state = self.beforeDoctypePublicIdentifierState + return True + + def beforeDoctypePublicIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == "\"": + self.currentToken["publicId"] = u"" + self.state = self.doctypePublicIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["publicId"] = u"" + self.state = self.doctypePublicIdentifierSingleQuotedState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def doctypePublicIdentifierDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterDoctypePublicIdentifierState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["publicId"] += u"\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["publicId"] += data + return True + + def doctypePublicIdentifierSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterDoctypePublicIdentifierState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["publicId"] += u"\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["publicId"] += data + return True + + def afterDoctypePublicIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.betweenDoctypePublicAndSystemIdentifiersState + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == '"': + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def betweenDoctypePublicAndSystemIdentifiersState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == '"': + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def afterDoctypeSystemKeywordState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypeSystemIdentifierState + elif data in ("'", '"'): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypeSystemIdentifierState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.stream.unget(data) + self.state = self.beforeDoctypeSystemIdentifierState + return True + + def beforeDoctypeSystemIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == "\"": + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + 
elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def doctypeSystemIdentifierDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterDoctypeSystemIdentifierState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["systemId"] += u"\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["systemId"] += data + return True + + def doctypeSystemIdentifierSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterDoctypeSystemIdentifierState + elif data == u"\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["systemId"] += u"\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["systemId"] += data + return True + + def afterDoctypeSystemIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.state = self.bogusDoctypeState + return True + + def bogusDoctypeState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + # XXX EMIT + self.stream.unget(data) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + pass + return True + + def cdataSectionState(self): + data = [] + while True: + data.append(self.stream.charsUntil(u"]")) + charStack = [] + + for expected in ["]", "]", ">"]: + charStack.append(self.stream.char()) + matched = True + if charStack[-1] == EOF: + data.extend(charStack[:-1]) + break + elif charStack[-1] != expected: + matched = False + data.extend(charStack) + break + + if matched: + break + data = "".join(data) + #Deal with null here rather than in the parser + nullCount = data.count(u"\u0000") + if nullCount > 0: + for i in xrange(nullCount): + 
self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                        "data": "invalid-codepoint"})
+        data = data.replace(u"\u0000", u"\uFFFD")
+        if data:
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": data})
+        self.state = self.dataState
+        return True
diff --git a/libs/html5lib/treebuilders/__init__.py b/libs/html5lib/treebuilders/__init__.py
new file mode 100755
index 0000000..14f66d4
--- /dev/null
+++ b/libs/html5lib/treebuilders/__init__.py
@@ -0,0 +1,96 @@
+"""A collection of modules for building different kinds of tree from
+HTML documents.
+
+To create a treebuilder for a new type of tree, you need to
+implement several things:
+
+1) A set of classes for various types of elements: Document, Doctype,
+Comment, Element. These must implement the interface of
+_base.treebuilders.Node (although comment nodes have a different
+signature for their constructor, see treebuilders.simpletree.Comment).
+Textual content may also be implemented as another node type, or not, as
+your tree implementation requires.
+
+2) A treebuilder object (called TreeBuilder by convention) that
+inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
+documentClass - the class to use for the bottommost node of a document
+elementClass - the class to use for HTML Elements
+commentClass - the class to use for comments
+doctypeClass - the class to use for doctypes
+It also has one required method:
+getDocument - Returns the root node of the complete document tree
+
+3) If you wish to run the unit tests, you must also create a
+testSerializer method on your treebuilder which accepts a node and
+returns a string containing the node and its children serialized according
+to the format used in the unittests
+
+The supplied simpletree module provides a python-only implementation
+of a full treebuilder and is a useful reference for the semantics of
+the various methods.
+"""
+
+treeBuilderCache = {}
+
+import sys
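For orientation, a minimal usage sketch (illustrative only, not part of the patch; it assumes the HTMLParser entry point that the vendored libs/html5lib/__init__.py exposes):

    import html5lib
    from html5lib import treebuilders

    # getTreeBuilder below picks a sensible ElementTree implementation when
    # no implementation module is passed explicitly.
    TreeBuilder = treebuilders.getTreeBuilder("etree")
    parser = html5lib.HTMLParser(tree=TreeBuilder)
    document = parser.parse("<p class=x>Hello<p>World")  # error-tolerant parse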
+def getTreeBuilder(treeType, implementation=None, **kwargs):
+    """Get a TreeBuilder class for various types of tree with built-in support
+
+    treeType - the name of the tree type required (case-insensitive). Supported
+               values are "simpletree", "dom", "etree" and "beautifulsoup"
+
+               "simpletree" - a built-in DOM-ish tree type with support for some
+                              more pythonic idioms.
+               "dom" - A generic builder for DOM implementations, defaulting to
+                       a xml.dom.minidom based implementation for the sake of
+                       backwards compatibility (as releases up until 0.10 had a
+                       builder called "dom" that was a minidom implementation).
+               "etree" - A generic builder for tree implementations exposing an
+                         elementtree-like interface (known to work with
+                         ElementTree, cElementTree and lxml.etree).
+               "beautifulsoup" - Beautiful soup (if installed)
+
+    implementation - (Currently applies to the "etree" and "dom" tree types). A
+                     module implementing the tree type e.g.
+                     xml.etree.ElementTree or lxml.etree."""
+
+    treeType = treeType.lower()
+    if treeType not in treeBuilderCache:
+        if treeType == "dom":
+            import dom
+            # XXX: Keep backwards compatibility by using minidom if no implementation is given
+            if implementation is None:
+                from xml.dom import minidom
+                implementation = minidom
+            # XXX: NEVER cache here, caching is done in the dom submodule
+            return dom.getDomModule(implementation, **kwargs).TreeBuilder
+        elif treeType == "simpletree":
+            import simpletree
+            treeBuilderCache[treeType] = simpletree.TreeBuilder
+        elif treeType == "beautifulsoup":
+            import soup
+            treeBuilderCache[treeType] = soup.TreeBuilder
+        elif treeType == "lxml":
+            import etree_lxml
+            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
+        elif treeType == "etree":
+            # Come up with a sane default
+            if implementation is None:
+                try:
+                    import xml.etree.cElementTree as ET
+                except ImportError:
+                    try:
+                        import xml.etree.ElementTree as ET
+                    except ImportError:
+                        try:
+                            import cElementTree as ET
+                        except ImportError:
+                            import elementtree.ElementTree as ET
+                implementation = ET
+            import etree
+            # NEVER cache here, caching is done in the etree submodule
+            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
+        else:
+            raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
+    return treeBuilderCache.get(treeType)
diff --git a/libs/html5lib/treebuilders/_base.py b/libs/html5lib/treebuilders/_base.py
new file mode 100755
index 0000000..f3782d2
--- /dev/null
+++ b/libs/html5lib/treebuilders/_base.py
@@ -0,0 +1,377 @@
+from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
+try:
+    frozenset
+except NameError:
+    # Import from the sets module for python 2.3
+    from sets import Set as set
+    from sets import ImmutableSet as frozenset
+
+# The scope markers are inserted when entering object elements,
+# marquees, table cells, and table captions, and are used to prevent formatting
+# from "leaking" into tables, object elements, and marquees.
+Marker = None
+
+class Node(object):
+    def __init__(self, name):
+        """Node representing an item in the tree.
+        name - The tag name associated with the node
+        parent - The parent of the current node (or None for the document node)
+        value - The value of the current node (applies to text nodes and
+                comments)
+        attributes - a dict holding name, value pairs for attributes of the node
+        childNodes - a list of child nodes of the current node. This must
+                     include all elements but not necessarily other node types
+        _flags - A list of miscellaneous flags that can be set on the node
+        """
+        self.name = name
+        self.parent = None
+        self.value = None
+        self.attributes = {}
+        self.childNodes = []
+        self._flags = []
+
+    def __unicode__(self):
+        attributesStr = " ".join(["%s=\"%s\""%(name, value)
+                                  for name, value in
+                                  self.attributes.iteritems()])
+        if attributesStr:
+            return "<%s %s>"%(self.name, attributesStr)
+        else:
+            return "<%s>"%(self.name)
+
+    def __repr__(self):
+        return "<%s>" % (self.name)
+
+    def appendChild(self, node):
+        """Insert node as a child of the current node
+        """
+        raise NotImplementedError
+
+    def insertText(self, data, insertBefore=None):
+        """Insert data as text in the current node, positioned before the
+        start of node insertBefore or to the end of the node's text.
+        """
+        raise NotImplementedError
+
+    def insertBefore(self, node, refNode):
+        """Insert node as a child of the current node, before refNode in the
+        list of child nodes.
Raises ValueError if refNode is not a child of + the current node""" + raise NotImplementedError + + def removeChild(self, node): + """Remove node from the children of the current node + """ + raise NotImplementedError + + def reparentChildren(self, newParent): + """Move all the children of the current node to newParent. + This is needed so that trees that don't store text as nodes move the + text in the correct way + """ + #XXX - should this method be made more general? + for child in self.childNodes: + newParent.appendChild(child) + self.childNodes = [] + + def cloneNode(self): + """Return a shallow copy of the current node i.e. a node with the same + name and attributes but with no parent or child nodes + """ + raise NotImplementedError + + + def hasContent(self): + """Return true if the node has children or text, false otherwise + """ + raise NotImplementedError + +class ActiveFormattingElements(list): + def append(self, node): + equalCount = 0 + if node != Marker: + for element in self[::-1]: + if element == Marker: + break + if self.nodesEqual(element, node): + equalCount += 1 + if equalCount == 3: + self.remove(element) + break + list.append(self, node) + + def nodesEqual(self, node1, node2): + if not node1.nameTuple == node2.nameTuple: + return False + + if not node1.attributes == node2.attributes: + return False + + return True + +class TreeBuilder(object): + """Base treebuilder implementation + documentClass - the class to use for the bottommost node of a document + elementClass - the class to use for HTML Elements + commentClass - the class to use for comments + doctypeClass - the class to use for doctypes + """ + + #Document class + documentClass = None + + #The class to use for creating a node + elementClass = None + + #The class to use for creating comments + commentClass = None + + #The class to use for creating doctypes + doctypeClass = None + + #Fragment class + fragmentClass = None + + def __init__(self, namespaceHTMLElements): + if namespaceHTMLElements: + self.defaultNamespace = "http://www.w3.org/1999/xhtml" + else: + self.defaultNamespace = None + self.reset() + + def reset(self): + self.openElements = [] + self.activeFormattingElements = ActiveFormattingElements() + + #XXX - rename these to headElement, formElement + self.headPointer = None + self.formPointer = None + + self.insertFromTable = False + + self.document = self.documentClass() + + def elementInScope(self, target, variant=None): + + #If we pass a node in we match that. if we pass a string + #match any node with that name + exactNode = hasattr(target, "nameTuple") + + listElementsMap = { + None:(scopingElements, False), + "button":(scopingElements | set([(namespaces["html"], "button")]), False), + "list":(scopingElements | set([(namespaces["html"], "ol"), + (namespaces["html"], "ul")]), False), + "table":(set([(namespaces["html"], "html"), + (namespaces["html"], "table")]), False), + "select":(set([(namespaces["html"], "optgroup"), + (namespaces["html"], "option")]), True) + } + listElements, invert = listElementsMap[variant] + + for node in reversed(self.openElements): + if (node.name == target and not exactNode or + node == target and exactNode): + return True + elif (invert ^ (node.nameTuple in listElements)): + return False + + assert False # We should never reach this point + + def reconstructActiveFormattingElements(self): + # Within this algorithm the order of steps described in the + # specification is not quite the same as the order of steps in the + # code. It should still do the same though. 
+ + # Step 1: stop the algorithm when there's nothing to do. + if not self.activeFormattingElements: + return + + # Step 2 and step 3: we start with the last element. So i is -1. + i = len(self.activeFormattingElements) - 1 + entry = self.activeFormattingElements[i] + if entry == Marker or entry in self.openElements: + return + + # Step 6 + while entry != Marker and entry not in self.openElements: + if i == 0: + #This will be reset to 0 below + i = -1 + break + i -= 1 + # Step 5: let entry be one earlier in the list. + entry = self.activeFormattingElements[i] + + while True: + # Step 7 + i += 1 + + # Step 8 + entry = self.activeFormattingElements[i] + clone = entry.cloneNode() #Mainly to get a new copy of the attributes + + # Step 9 + element = self.insertElement({"type":"StartTag", + "name":clone.name, + "namespace":clone.namespace, + "data":clone.attributes}) + + # Step 10 + self.activeFormattingElements[i] = element + + # Step 11 + if element == self.activeFormattingElements[-1]: + break + + def clearActiveFormattingElements(self): + entry = self.activeFormattingElements.pop() + while self.activeFormattingElements and entry != Marker: + entry = self.activeFormattingElements.pop() + + def elementInActiveFormattingElements(self, name): + """Check if an element exists between the end of the active + formatting elements and the last marker. If it does, return it, else + return false""" + + for item in self.activeFormattingElements[::-1]: + # Check for Marker first because if it's a Marker it doesn't have a + # name attribute. + if item == Marker: + break + elif item.name == name: + return item + return False + + def insertRoot(self, token): + element = self.createElement(token) + self.openElements.append(element) + self.document.appendChild(element) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = self.doctypeClass(name, publicId, systemId) + self.document.appendChild(doctype) + + def insertComment(self, token, parent=None): + if parent is None: + parent = self.openElements[-1] + parent.appendChild(self.commentClass(token["data"])) + + def createElement(self, token): + """Create an element but don't insert it anywhere""" + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + element = self.elementClass(name, namespace) + element.attributes = token["data"] + return element + + def _getInsertFromTable(self): + return self._insertFromTable + + def _setInsertFromTable(self, value): + """Switch the function used to insert an element from the + normal one to the misnested table one and back again""" + self._insertFromTable = value + if value: + self.insertElement = self.insertElementTable + else: + self.insertElement = self.insertElementNormal + + insertFromTable = property(_getInsertFromTable, _setInsertFromTable) + + def insertElementNormal(self, token): + name = token["name"] + assert type(name) == unicode, "Element %s not unicode"%name + namespace = token.get("namespace", self.defaultNamespace) + element = self.elementClass(name, namespace) + element.attributes = token["data"] + self.openElements[-1].appendChild(element) + self.openElements.append(element) + return element + + def insertElementTable(self, token): + """Create an element and insert it into the tree""" + element = self.createElement(token) + if self.openElements[-1].name not in tableInsertModeElements: + return self.insertElementNormal(token) + else: + #We should be in the InTable mode. 
This means we want to do + #special magic element rearranging + parent, insertBefore = self.getTableMisnestedNodePosition() + if insertBefore is None: + parent.appendChild(element) + else: + parent.insertBefore(element, insertBefore) + self.openElements.append(element) + return element + + def insertText(self, data, parent=None): + """Insert text data.""" + if parent is None: + parent = self.openElements[-1] + + if (not self.insertFromTable or (self.insertFromTable and + self.openElements[-1].name + not in tableInsertModeElements)): + parent.insertText(data) + else: + # We should be in the InTable mode. This means we want to do + # special magic element rearranging + parent, insertBefore = self.getTableMisnestedNodePosition() + parent.insertText(data, insertBefore) + + def getTableMisnestedNodePosition(self): + """Get the foster parent element, and sibling to insert before + (or None) when inserting a misnested table node""" + # The foster parent element is the one which comes before the most + # recently opened table element + # XXX - this is really inelegant + lastTable=None + fosterParent = None + insertBefore = None + for elm in self.openElements[::-1]: + if elm.name == "table": + lastTable = elm + break + if lastTable: + # XXX - we should really check that this parent is actually a + # node here + if lastTable.parent: + fosterParent = lastTable.parent + insertBefore = lastTable + else: + fosterParent = self.openElements[ + self.openElements.index(lastTable) - 1] + else: + fosterParent = self.openElements[0] + return fosterParent, insertBefore + + def generateImpliedEndTags(self, exclude=None): + name = self.openElements[-1].name + # XXX td, th and tr are not actually needed + if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) + and name != exclude): + self.openElements.pop() + # XXX This is not entirely what the specification says. We should + # investigate it more closely. 
+ self.generateImpliedEndTags(exclude) + + def getDocument(self): + "Return the final tree" + return self.document + + def getFragment(self): + "Return the final fragment" + #assert self.innerHTML + fragment = self.fragmentClass() + self.openElements[0].reparentChildren(fragment) + return fragment + + def testSerializer(self, node): + """Serialize the subtree of node in the format required by unit tests + node - the node from which to start serializing""" + raise NotImplementedError diff --git a/libs/html5lib/treebuilders/dom.py b/libs/html5lib/treebuilders/dom.py new file mode 100644 index 0000000..9578da2 --- /dev/null +++ b/libs/html5lib/treebuilders/dom.py @@ -0,0 +1,291 @@ + +from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE +try: + from types import ModuleType +except: + from new import module as ModuleType +import re +import weakref + +import _base +from html5lib import constants, ihatexml +from html5lib.constants import namespaces + +moduleCache = {} + +def getDomModule(DomImplementation): + name = "_" + DomImplementation.__name__+"builder" + if name in moduleCache: + return moduleCache[name] + else: + mod = ModuleType(name) + objs = getDomBuilder(DomImplementation) + mod.__dict__.update(objs) + moduleCache[name] = mod + return mod + +def getDomBuilder(DomImplementation): + Dom = DomImplementation + class AttrList(object): + def __init__(self, element): + self.element = element + def __iter__(self): + return self.element.attributes.items().__iter__() + def __setitem__(self, name, value): + self.element.setAttribute(name, value) + def __len__(self): + return len(self.element.attributes.items()) + def items(self): + return [(item[0], item[1]) for item in + self.element.attributes.items()] + def keys(self): + return self.element.attributes.keys() + def __getitem__(self, name): + return self.element.getAttribute(name) + + def __contains__(self, name): + if isinstance(name, tuple): + raise NotImplementedError + else: + return self.element.hasAttribute(name) + + class NodeBuilder(_base.Node): + def __init__(self, element): + _base.Node.__init__(self, element.nodeName) + self.element = element + + namespace = property(lambda self:hasattr(self.element, "namespaceURI") + and self.element.namespaceURI or None) + + def appendChild(self, node): + node.parent = self + self.element.appendChild(node.element) + + def insertText(self, data, insertBefore=None): + text = self.element.ownerDocument.createTextNode(data) + if insertBefore: + self.element.insertBefore(text, insertBefore.element) + else: + self.element.appendChild(text) + + def insertBefore(self, node, refNode): + self.element.insertBefore(node.element, refNode.element) + node.parent = self + + def removeChild(self, node): + if node.element.parentNode == self.element: + self.element.removeChild(node.element) + node.parent = None + + def reparentChildren(self, newParent): + while self.element.hasChildNodes(): + child = self.element.firstChild + self.element.removeChild(child) + newParent.element.appendChild(child) + self.childNodes = [] + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes: + for name, value in attributes.items(): + if isinstance(name, tuple): + if name[0] is not None: + qualifiedName = (name[0] + ":" + name[1]) + else: + qualifiedName = name[1] + self.element.setAttributeNS(name[2], qualifiedName, + value) + else: + self.element.setAttribute( + name, value) + attributes = property(getAttributes, setAttributes) + + def cloneNode(self): + return 
NodeBuilder(self.element.cloneNode(False))
+
+        def hasContent(self):
+            return self.element.hasChildNodes()
+
+        def getNameTuple(self):
+            if self.namespace is None:
+                return namespaces["html"], self.name
+            else:
+                return self.namespace, self.name
+
+        nameTuple = property(getNameTuple)
+
+    class TreeBuilder(_base.TreeBuilder):
+        def documentClass(self):
+            self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
+            return weakref.proxy(self)
+
+        def insertDoctype(self, token):
+            name = token["name"]
+            publicId = token["publicId"]
+            systemId = token["systemId"]
+
+            domimpl = Dom.getDOMImplementation()
+            doctype = domimpl.createDocumentType(name, publicId, systemId)
+            self.document.appendChild(NodeBuilder(doctype))
+            if Dom == minidom:
+                doctype.ownerDocument = self.dom
+
+        def elementClass(self, name, namespace=None):
+            if namespace is None and self.defaultNamespace is None:
+                node = self.dom.createElement(name)
+            else:
+                node = self.dom.createElementNS(namespace, name)
+
+            return NodeBuilder(node)
+
+        def commentClass(self, data):
+            return NodeBuilder(self.dom.createComment(data))
+
+        def fragmentClass(self):
+            return NodeBuilder(self.dom.createDocumentFragment())
+
+        def appendChild(self, node):
+            self.dom.appendChild(node.element)
+
+        def testSerializer(self, element):
+            return testSerializer(element)
+
+        def getDocument(self):
+            return self.dom
+
+        def getFragment(self):
+            return _base.TreeBuilder.getFragment(self).element
+
+        def insertText(self, data, parent=None):
+            if parent != self:
+                _base.TreeBuilder.insertText(self, data, parent)
+            else:
+                # HACK: allow text nodes as children of the document node
+                if hasattr(self.dom, '_child_node_types'):
+                    if not Node.TEXT_NODE in self.dom._child_node_types:
+                        self.dom._child_node_types = list(self.dom._child_node_types)
+                        self.dom._child_node_types.append(Node.TEXT_NODE)
+                self.dom.appendChild(self.dom.createTextNode(data))
+
+        name = None
+
+    def testSerializer(element):
+        element.normalize()
+        rv = []
+        def serializeElement(element, indent=0):
+            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
+                if element.name:
+                    if element.publicId or element.systemId:
+                        publicId = element.publicId or ""
+                        systemId = element.systemId or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%(
+                                ' '*indent, element.name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
+                else:
+                    rv.append("|%s<!DOCTYPE >"%(' '*indent,))
+            elif element.nodeType == Node.DOCUMENT_NODE:
+                rv.append("#document")
+            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+                rv.append("#document-fragment")
+            elif element.nodeType == Node.COMMENT_NODE:
+                rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
+            elif element.nodeType == Node.TEXT_NODE:
+                rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
+            else:
+                if (hasattr(element, "namespaceURI") and
+                    element.namespaceURI is not None):
+                    name = "%s %s"%(constants.prefixes[element.namespaceURI],
+                                    element.nodeName)
+                else:
+                    name = element.nodeName
+                rv.append("|%s<%s>"%(' '*indent, name))
+                if element.hasAttributes():
+                    attributes = []
+                    for i in range(len(element.attributes)):
+                        attr = element.attributes.item(i)
+                        name = attr.nodeName
+                        value = attr.value
+                        ns = attr.namespaceURI
+                        if ns:
+                            name = "%s %s"%(constants.prefixes[ns], attr.localName)
+                        else:
+                            name = attr.nodeName
+                        attributes.append((name, value))
+
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
+            indent += 2
+            for child in element.childNodes:
+                serializeElement(child, indent)
+        serializeElement(element, 0)
+
+        return "\n".join(rv)
+
+    def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
+        if node.nodeType == Node.ELEMENT_NODE:
+            if not nsmap:
+                handler.startElement(node.nodeName, node.attributes)
+                for child in node.childNodes:
+                    dom2sax(child, handler, nsmap)
+                handler.endElement(node.nodeName)
+            else:
+                attributes = dict(node.attributes.itemsNS())
+
+                # gather namespace declarations
+                prefixes = []
+                for attrname in node.attributes.keys():
+                    attr = node.getAttributeNode(attrname)
+                    if (attr.namespaceURI == XMLNS_NAMESPACE or
+                        (attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
+                        prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
+                        handler.startPrefixMapping(prefix, attr.nodeValue)
+                        prefixes.append(prefix)
+                        nsmap = nsmap.copy()
+                        nsmap[prefix] = attr.nodeValue
+                        del attributes[(attr.namespaceURI, attr.nodeName)]
+
+                # apply namespace declarations
+                for attrname in node.attributes.keys():
+                    attr = node.getAttributeNode(attrname)
+                    if attr.namespaceURI is None and ':' in attr.nodeName:
+                        prefix = attr.nodeName.split(':')[0]
+                        if prefix in nsmap:
+                            del attributes[(attr.namespaceURI, attr.nodeName)]
+                            attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue
+
+                # SAX events
+                ns = node.namespaceURI or nsmap.get(None, None)
+                handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
+                for child in node.childNodes:
+                    dom2sax(child, handler, nsmap)
+                handler.endElementNS((ns, node.nodeName), node.nodeName)
+                for prefix in prefixes:
+                    handler.endPrefixMapping(prefix)
+
+        elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
+            handler.characters(node.nodeValue)
+
+        elif node.nodeType == Node.DOCUMENT_NODE:
+            handler.startDocument()
+            for child in node.childNodes:
+                dom2sax(child, handler, nsmap)
+            handler.endDocument()
+
+        elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+            for child in node.childNodes:
+                dom2sax(child, handler, nsmap)
+
+        else:
+            # ATTRIBUTE_NODE
+            # ENTITY_NODE
+            # PROCESSING_INSTRUCTION_NODE
+            # COMMENT_NODE
+            # DOCUMENT_TYPE_NODE
+            # NOTATION_NODE
+            pass
+
+    return locals()
+
+# Keep backwards compatibility with things that directly load
+# classes/functions from this module
+for key, value in getDomModule(minidom).__dict__.items():
+    globals()[key] = value
diff --git a/libs/html5lib/treebuilders/etree.py b/libs/html5lib/treebuilders/etree.py
new file mode 100755
index 0000000..95be475
--- /dev/null
+++ b/libs/html5lib/treebuilders/etree.py
@@ -0,0 +1,344 @@
+try:
+    from types import ModuleType
+except ImportError:
+    from new import module as ModuleType
+import re
+import types
+
+import _base
+from html5lib import ihatexml
+from html5lib import constants
+from html5lib.constants import namespaces
+
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+moduleCache = {}
+
+def getETreeModule(ElementTreeImplementation, fullTree=False):
+    name = "_" + ElementTreeImplementation.__name__ + "builder"
+    if name in moduleCache:
+        return moduleCache[name]
+    else:
+        mod = ModuleType("_" + ElementTreeImplementation.__name__ + "builder")
+        objs = getETreeBuilder(ElementTreeImplementation, fullTree)
+        mod.__dict__.update(objs)
+        moduleCache[name] = mod
+        return mod
+
+def getETreeBuilder(ElementTreeImplementation, fullTree=False):
+    ElementTree = ElementTreeImplementation
+    class Element(_base.Node):
+        def __init__(self, name, namespace=None):
+            self._name = name
+            self._namespace = namespace
+            self._element = ElementTree.Element(self._getETreeTag(name,
+                                                                  namespace))
+            if namespace is None:
+                self.nameTuple = namespaces["html"], self._name
+            else:
+                self.nameTuple =
self._namespace, self._name + self.parent = None + self._childNodes = [] + self._flags = [] + + def _getETreeTag(self, name, namespace): + if namespace is None: + etree_tag = name + else: + etree_tag = "{%s}%s"%(namespace, name) + return etree_tag + + def _setName(self, name): + self._name = name + self._element.tag = self._getETreeTag(self._name, self._namespace) + + def _getName(self): + return self._name + + name = property(_getName, _setName) + + def _setNamespace(self, namespace): + self._namespace = namespace + self._element.tag = self._getETreeTag(self._name, self._namespace) + + def _getNamespace(self): + return self._namespace + + namespace = property(_getNamespace, _setNamespace) + + def _getAttributes(self): + return self._element.attrib + + def _setAttributes(self, attributes): + #Delete existing attributes first + #XXX - there may be a better way to do this... + for key in self._element.attrib.keys(): + del self._element.attrib[key] + for key, value in attributes.iteritems(): + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], key[1]) + else: + name = key + self._element.set(name, value) + + attributes = property(_getAttributes, _setAttributes) + + def _getChildNodes(self): + return self._childNodes + def _setChildNodes(self, value): + del self._element[:] + self._childNodes = [] + for element in value: + self.insertChild(element) + + childNodes = property(_getChildNodes, _setChildNodes) + + def hasContent(self): + """Return true if the node has children or text""" + return bool(self._element.text or len(self._element)) + + def appendChild(self, node): + self._childNodes.append(node) + self._element.append(node._element) + node.parent = self + + def insertBefore(self, node, refNode): + index = list(self._element).index(refNode._element) + self._element.insert(index, node._element) + node.parent = self + + def removeChild(self, node): + self._element.remove(node._element) + node.parent=None + + def insertText(self, data, insertBefore=None): + if not(len(self._element)): + if not self._element.text: + self._element.text = "" + self._element.text += data + elif insertBefore is None: + #Insert the text as the tail of the last child element + if not self._element[-1].tail: + self._element[-1].tail = "" + self._element[-1].tail += data + else: + #Insert the text before the specified node + children = list(self._element) + index = children.index(insertBefore._element) + if index > 0: + if not self._element[index-1].tail: + self._element[index-1].tail = "" + self._element[index-1].tail += data + else: + if not self._element.text: + self._element.text = "" + self._element.text += data + + def cloneNode(self): + element = type(self)(self.name, self.namespace) + for name, value in self.attributes.iteritems(): + element.attributes[name] = value + return element + + def reparentChildren(self, newParent): + if newParent.childNodes: + newParent.childNodes[-1]._element.tail += self._element.text + else: + if not newParent._element.text: + newParent._element.text = "" + if self._element.text is not None: + newParent._element.text += self._element.text + self._element.text = "" + _base.Node.reparentChildren(self, newParent) + + class Comment(Element): + def __init__(self, data): + #Use the superclass constructor to set all properties on the + #wrapper element + self._element = ElementTree.Comment(data) + self.parent = None + self._childNodes = [] + self._flags = [] + + def _getData(self): + return self._element.text + + def _setData(self, value): + self._element.text = value + + data = 
property(_getData, _setData)
+
+    class DocumentType(Element):
+        def __init__(self, name, publicId, systemId):
+            Element.__init__(self, "<!DOCTYPE>")
+            self._element.text = name
+            self.publicId = publicId
+            self.systemId = systemId
+
+        def _getPublicId(self):
+            return self._element.get(u"publicId", "")
+
+        def _setPublicId(self, value):
+            if value is not None:
+                self._element.set(u"publicId", value)
+
+        publicId = property(_getPublicId, _setPublicId)
+
+        def _getSystemId(self):
+            return self._element.get(u"systemId", "")
+
+        def _setSystemId(self, value):
+            if value is not None:
+                self._element.set(u"systemId", value)
+
+        systemId = property(_getSystemId, _setSystemId)
+
+    class Document(Element):
+        def __init__(self):
+            Element.__init__(self, "<DOCUMENT_ROOT>")
+
+    class DocumentFragment(Element):
+        def __init__(self):
+            Element.__init__(self, "<DOCUMENT_FRAGMENT>")
+
+    def testSerializer(element):
+        rv = []
+        finalText = None
+        def serializeElement(element, indent=0):
+            if not(hasattr(element, "tag")):
+                element = element.getroot()
+            if element.tag == "<!DOCTYPE>":
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append("""<!DOCTYPE %s "%s" "%s">"""%(
+                            element.text, publicId, systemId))
+                else:
+                    rv.append("<!DOCTYPE %s>"%(element.text,))
+            elif element.tag == "<DOCUMENT_ROOT>":
+                rv.append("#document")
+                if element.text:
+                    rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
+                if element.tail:
+                    finalText = element.tail
+            elif element.tag == ElementTree.Comment:
+                rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
+            else:
+                assert type(element.tag) in types.StringTypes, "Expected unicode, got %s"%type(element.tag)
+                nsmatch = tag_regexp.match(element.tag)
+
+                if nsmatch is None:
+                    name = element.tag
+                else:
+                    ns, name = nsmatch.groups()
+                    prefix = constants.prefixes[ns]
+                    name = "%s %s"%(prefix, name)
+                rv.append("|%s<%s>"%(' '*indent, name))
+
+                if hasattr(element, "attrib"):
+                    attributes = []
+                    for name, value in element.attrib.iteritems():
+                        nsmatch = tag_regexp.match(name)
+                        if nsmatch is not None:
+                            ns, name = nsmatch.groups()
+                            prefix = constants.prefixes[ns]
+                            attr_string = "%s %s"%(prefix, name)
+                        else:
+                            attr_string = name
+                        attributes.append((attr_string, value))
+
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
+                if element.text:
+                    rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
+            indent += 2
+            for child in element:
+                serializeElement(child, indent)
+            if element.tail:
+                rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
+        serializeElement(element, 0)
+
+        if finalText is not None:
+            rv.append("|%s\"%s\""%(' '*2, finalText))
+
+        return "\n".join(rv)
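As a usage sketch (illustrative only, not part of the patch): testSerializer above is what the test suite consumes; ordinary callers get a standard ElementTree node back from the parser and can serialize it with the stock ElementTree API. namespaceHTMLElements=False keeps the tag names plain:

    import xml.etree.ElementTree as ElementTree
    import html5lib
    from html5lib import treebuilders

    builder = treebuilders.getTreeBuilder("etree", ElementTree)
    parser = html5lib.HTMLParser(tree=builder, namespaceHTMLElements=False)
    root = parser.parse("<b>bold</b> text")  # the <html> Element
    print ElementTree.tostring(root)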
+
+    def tostring(element):
+        """Serialize an element and its child nodes to a string"""
+        rv = []
+        finalText = None
+        filter = ihatexml.InfosetFilter()
+        def serializeElement(element):
+            if type(element) == type(ElementTree.ElementTree):
+                element = element.getroot()
+
+            if element.tag == "<!DOCTYPE>":
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
+                            element.text, publicId, systemId))
+                else:
+                    rv.append("<!DOCTYPE %s>"%(element.text,))
+            elif element.tag == "<DOCUMENT_ROOT>":
+                if element.text:
+                    rv.append(element.text)
+                if element.tail:
+                    finalText = element.tail
+
+                for child in element:
+                    serializeElement(child)
+
+            elif type(element.tag) == type(ElementTree.Comment):
+                rv.append("<!-- %s -->"%(element.text,))
+            else:
+                #This is assumed to be an ordinary element
+                if not element.attrib:
+                    rv.append("<%s>"%(filter.fromXmlName(element.tag),))
+                else:
+                    attr = " ".join(["%s=\"%s\""%(
+                            filter.fromXmlName(name), value)
+                            for name, value in element.attrib.iteritems()])
+                    rv.append("<%s %s>"%(element.tag, attr))
+                if element.text:
+                    rv.append(element.text)
+
+                for child in element:
+                    serializeElement(child)
+
+                rv.append("</%s>"%(element.tag,))
+
+            if element.tail:
+                rv.append(element.tail)
+
+        serializeElement(element)
+
+        if finalText is not None:
+            rv.append("%s\"%s\""%(' '*2, finalText))
+
+        return "".join(rv)
+
+    class TreeBuilder(_base.TreeBuilder):
+        documentClass = Document
+        doctypeClass = DocumentType
+        elementClass = Element
+        commentClass = Comment
+        fragmentClass = DocumentFragment
+
+        def testSerializer(self, element):
+            return testSerializer(element)
+
+        def getDocument(self):
+            if fullTree:
+                return self.document._element
+            else:
+                if self.defaultNamespace is not None:
+                    return self.document._element.find(
+                        "{%s}html"%self.defaultNamespace)
+                else:
+                    return self.document._element.find("html")
+
+        def getFragment(self):
+            return _base.TreeBuilder.getFragment(self)._element
+
+    return locals()
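The lxml-backed builder added next keeps the parsed document in a genuine lxml tree, so lxml's own tooling (XPath, serialization) applies directly. A hedged sketch (illustrative only; it relies on the module-level fullTree = True default seen below, which makes parse() return an lxml ElementTree):

    import html5lib
    from html5lib import treebuilders

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    doc = parser.parse("<p>one</p><p>two</p>")  # an lxml ElementTree
    print doc.getroot().tag  # {http://www.w3.org/1999/xhtml}html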
diff --git a/libs/html5lib/treebuilders/etree_lxml.py b/libs/html5lib/treebuilders/etree_lxml.py
new file mode 100644
index 0000000..eee1e3b
--- /dev/null
+++ b/libs/html5lib/treebuilders/etree_lxml.py
@@ -0,0 +1,336 @@
+import warnings
+import re
+
+import _base
+from html5lib.constants import DataLossWarning
+import html5lib.constants as constants
+import etree as etree_builders
+from html5lib import ihatexml
+
+try:
+    import lxml.etree as etree
+except ImportError:
+    pass
+
+fullTree = True
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+"""Module for supporting the lxml.etree library. The idea here is to use as much
+of the native library as possible, without using fragile hacks like custom element
+names that break between releases. The downside of this is that we cannot represent
+all possible trees; specifically the following are known to cause problems:
+
+Text or comments as siblings of the root element
+Doctypes with no name
+
+When any of these things occur, we emit a DataLossWarning
+"""
+
+class DocumentType(object):
+    def __init__(self, name, publicId, systemId):
+        self.name = name
+        self.publicId = publicId
+        self.systemId = systemId
+
+class Document(object):
+    def __init__(self):
+        self._elementTree = None
+        self._childNodes = []
+
+    def appendChild(self, element):
+        self._elementTree.getroot().addnext(element._element)
+
+    def _getChildNodes(self):
+        return self._childNodes
+
+    childNodes = property(_getChildNodes)
+
+def testSerializer(element):
+    rv = []
+    finalText = None
+    filter = ihatexml.InfosetFilter()
+    def serializeElement(element, indent=0):
+        if not hasattr(element, "tag"):
+            if hasattr(element, "getroot"):
+                #Full tree case
+                rv.append("#document")
+                if element.docinfo.internalDTD:
+                    if not (element.docinfo.public_id or
+                            element.docinfo.system_url):
+                        dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
+                    else:
+                        dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
+                            element.docinfo.root_name,
+                            element.docinfo.public_id,
+                            element.docinfo.system_url)
+                    rv.append("|%s%s"%(' '*(indent+2), dtd_str))
+                next_element = element.getroot()
+                while next_element.getprevious() is not None:
+                    next_element = next_element.getprevious()
+                while next_element is not None:
+                    serializeElement(next_element, indent+2)
+                    next_element = next_element.getnext()
+            elif isinstance(element, basestring):
+                #Text in a fragment
+                rv.append("|%s\"%s\""%(' '*indent, element))
+            else:
+                #Fragment case
+                rv.append("#document-fragment")
+                for next_element in element:
+                    serializeElement(next_element, indent+2)
+        elif type(element.tag) == type(etree.Comment):
+            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
+        else:
+            nsmatch = etree_builders.tag_regexp.match(element.tag)
+            if nsmatch is not None:
+                ns = nsmatch.group(1)
+                tag = nsmatch.group(2)
+                prefix = constants.prefixes[ns]
+                rv.append("|%s<%s %s>"%(' '*indent, prefix,
+                                        filter.fromXmlName(tag)))
+            else:
+                rv.append("|%s<%s>"%(' '*indent,
+                                     filter.fromXmlName(element.tag)))
+
+            if hasattr(element, "attrib"):
+                attributes = []
+                for name, value in element.attrib.iteritems():
+                    nsmatch = tag_regexp.match(name)
+                    if nsmatch is not None:
+                        ns, name = nsmatch.groups()
+                        name = filter.fromXmlName(name)
+                        prefix = constants.prefixes[ns]
+                        attr_string = "%s %s"%(prefix, name)
+                    else:
+                        attr_string = filter.fromXmlName(name)
+                    attributes.append((attr_string, value))
+
+                for name, value in sorted(attributes):
+                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
+
+            if element.text:
+                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
+            indent += 2
+            for child in element.getchildren():
+                serializeElement(child, indent)
+            if hasattr(element, "tail") and element.tail:
+                rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
+    serializeElement(element, 0)
+
+    if finalText is not None:
+        rv.append("|%s\"%s\""%(' '*2, finalText))
+
+    return "\n".join(rv)
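A sketch of the data-loss behaviour the module docstring warns about (illustrative only; the warning is raised by insertDoctype further down in this file): a nameless doctype cannot be carried into the lxml tree, so the builder should emit a DataLossWarning instead of failing:

    import warnings
    import html5lib
    from html5lib import treebuilders

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        parser.parse("<!DOCTYPE><html></html>")
    print [str(w.message) for w in caught]  # expect the DataLossWarning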
#This is assumed to be an ordinary element + if not element.attrib: + rv.append("<%s>"%(element.tag,)) + else: + attr = " ".join(["%s=\"%s\""%(name, value) + for name, value in element.attrib.iteritems()]) + rv.append("<%s %s>"%(element.tag, attr)) + if element.text: + rv.append(element.text) + + for child in element.getchildren(): + serializeElement(child) + + rv.append("</%s>"%(element.tag,)) + + if hasattr(element, "tail") and element.tail: + rv.append(element.tail) + + serializeElement(element) + + if finalText is not None: + rv.append("%s\"%s\""%(' '*2, finalText)) + + return "".join(rv) + + +class TreeBuilder(_base.TreeBuilder): + documentClass = Document + doctypeClass = DocumentType + elementClass = None + commentClass = None + fragmentClass = Document + + def __init__(self, namespaceHTMLElements, fullTree = False): + builder = etree_builders.getETreeModule(etree, fullTree=fullTree) + filter = self.filter = ihatexml.InfosetFilter() + self.namespaceHTMLElements = namespaceHTMLElements + + class Attributes(dict): + def __init__(self, element, value={}): + self._element = element + dict.__init__(self, value) + for key, value in self.iteritems(): + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) + else: + name = filter.coerceAttribute(key) + self._element._element.attrib[name] = value + + def __setitem__(self, key, value): + dict.__setitem__(self, key, value) + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) + else: + name = filter.coerceAttribute(key) + self._element._element.attrib[name] = value + + class Element(builder.Element): + def __init__(self, name, namespace): + name = filter.coerceElement(name) + builder.Element.__init__(self, name, namespace=namespace) + self._attributes = Attributes(self) + + def _setName(self, name): + self._name = filter.coerceElement(name) + self._element.tag = self._getETreeTag( + self._name, self._namespace) + + def _getName(self): + return filter.fromXmlName(self._name) + + name = property(_getName, _setName) + + def _getAttributes(self): + return self._attributes + + def _setAttributes(self, attributes): + self._attributes = Attributes(self, attributes) + + attributes = property(_getAttributes, _setAttributes) + + def insertText(self, data, insertBefore=None): + data = filter.coerceCharacters(data) + builder.Element.insertText(self, data, insertBefore) + + def appendChild(self, child): + builder.Element.appendChild(self, child) + + + class Comment(builder.Comment): + def __init__(self, data): + data = filter.coerceComment(data) + builder.Comment.__init__(self, data) + + def _setData(self, data): + data = filter.coerceComment(data) + self._element.text = data + + def _getData(self): + return self._element.text + + data = property(_getData, _setData) + + self.elementClass = Element + self.commentClass = builder.Comment + #self.fragmentClass = builder.DocumentFragment + _base.TreeBuilder.__init__(self, namespaceHTMLElements) + + def reset(self): + _base.TreeBuilder.reset(self) + self.insertComment = self.insertCommentInitial + self.initial_comments = [] + self.doctype = None + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + if fullTree: + return self.document._elementTree + else: + return self.document._elementTree.getroot() + + def getFragment(self): + fragment = [] + element = self.openElements[0]._element + if element.text: + fragment.append(element.text) + fragment.extend(element.getchildren()) + if element.tail: +
fragment.append(element.tail) + return fragment + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"': + warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning) + + doctype = self.doctypeClass(name, publicId, systemId) + self.doctype = doctype + + def insertCommentInitial(self, data, parent=None): + self.initial_comments.append(data) + + def insertRoot(self, token): + """Create the document root""" + #Because of the way libxml2 works, it doesn't seem to be possible to + #alter information like the doctype after the tree has been parsed. + #Therefore we need to use the built-in parser to create our initial + #tree, after which we can add elements like normal + docStr = "" + if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'): + docStr += "<!DOCTYPE %s" % self.doctype.name + if (self.doctype.publicId is not None or + self.doctype.systemId is not None): + docStr += ' PUBLIC "%s" "%s"' % (self.doctype.publicId or "", + self.doctype.systemId or "") + docStr += ">" + docStr += "<html></html>" + + try: + root = etree.fromstring(docStr) + except etree.XMLSyntaxError: + print docStr + raise + + #Append the initial comments: + for comment_token in self.initial_comments: + root.addprevious(etree.Comment(comment_token["data"])) + + #Create the root document and add the ElementTree to it + self.document = self.documentClass() + self.document._elementTree = root.getroottree() + + # Give the root element the right name + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + if namespace is None: + etree_tag = name + else: + etree_tag = "{%s}%s"%(namespace, name) + root.tag = etree_tag + + #Add the root element to the internal child/open data structures + root_element = self.elementClass(name, namespace) + root_element._element = root + self.document._childNodes.append(root_element) + self.openElements.append(root_element) + + #Reset to the default insert comment function + self.insertComment = super(TreeBuilder, self).insertComment diff --git a/libs/html5lib/treebuilders/simpletree.py b/libs/html5lib/treebuilders/simpletree.py new file mode 100755 index 0000000..67fe758 --- /dev/null +++ b/libs/html5lib/treebuilders/simpletree.py @@ -0,0 +1,256 @@ +import _base +from html5lib.constants import voidElements, namespaces, prefixes +from xml.sax.saxutils import escape + +# Really crappy basic implementation of a DOM-core like thing +class Node(_base.Node): + type = -1 + def __init__(self, name): + self.name = name + self.parent = None + self.value = None + self.childNodes = [] + self._flags = [] + + def __iter__(self): + for node in self.childNodes: + yield node + for item in node: + yield item + + def __unicode__(self): + return self.name + + def toxml(self): + raise NotImplementedError + + def printTree(self, indent=0): + tree = '\n|%s%s' % (' '* indent, unicode(self)) + for child in self.childNodes: + tree += child.printTree(indent + 2) + return tree + + def appendChild(self, node): + assert isinstance(node, Node) + if (isinstance(node, TextNode) and self.childNodes and + isinstance(self.childNodes[-1], TextNode)): + self.childNodes[-1].value += node.value + else: + self.childNodes.append(node) + node.parent = self + + def insertText(self, data, insertBefore=None): + assert isinstance(data, unicode), "data %s is of type %s expected unicode"%(repr(data), type(data)) + if insertBefore is None: + self.appendChild(TextNode(data)) + else: + self.insertBefore(TextNode(data), insertBefore) + + def insertBefore(self, node, refNode): + index = self.childNodes.index(refNode) + if (isinstance(node, TextNode) and index > 0 and +
isinstance(self.childNodes[index - 1], TextNode)): + self.childNodes[index - 1].value += node.value + else: + self.childNodes.insert(index, node) + node.parent = self + + def removeChild(self, node): + try: + self.childNodes.remove(node) + except: + # XXX + raise + node.parent = None + + def cloneNode(self): + raise NotImplementedError + + def hasContent(self): + """Return true if the node has children or text""" + return bool(self.childNodes) + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class Document(Node): + type = 1 + def __init__(self): + Node.__init__(self, None) + + def __str__(self): + return "#document" + + def __unicode__(self): + return str(self) + + def appendChild(self, child): + Node.appendChild(self, child) + + def toxml(self, encoding="utf-8"): + result = "" + for child in self.childNodes: + result += child.toxml() + return result.encode(encoding) + + def hilite(self, encoding="utf-8"): + result = "<pre>" + for child in self.childNodes: + result += child.hilite() + return result.encode(encoding) + "</pre>" + + def printTree(self): + tree = unicode(self) + for child in self.childNodes: + tree += child.printTree(2) + return tree + + def cloneNode(self): + return Document() + +class DocumentFragment(Document): + type = 2 + def __str__(self): + return "#document-fragment" + + def __unicode__(self): + return str(self) + + def cloneNode(self): + return DocumentFragment() + +class DocumentType(Node): + type = 3 + def __init__(self, name, publicId, systemId): + Node.__init__(self, name) + self.publicId = publicId + self.systemId = systemId + + def __unicode__(self): + if self.publicId or self.systemId: + publicId = self.publicId or "" + systemId = self.systemId or "" + return """<!DOCTYPE %s "%s" "%s">"""%( + self.name, publicId, systemId) + + else: + return u"<!DOCTYPE %s>" % self.name + + + toxml = __unicode__ + + def hilite(self): + return '<!DOCTYPE %s>' % self.name + + def cloneNode(self): + return DocumentType(self.name, self.publicId, self.systemId) + +class TextNode(Node): + type = 4 + def __init__(self, value): + Node.__init__(self, None) + self.value = value + + def __unicode__(self): + return u"\"%s\"" % self.value + + def toxml(self): + return escape(self.value) + + hilite = toxml + + def cloneNode(self): + return TextNode(self.value) + +class Element(Node): + type = 5 + def __init__(self, name, namespace=None): + Node.__init__(self, name) + self.namespace = namespace + self.attributes = {} + + def __unicode__(self): + if self.namespace == None: + return u"<%s>" % self.name + else: + return u"<%s %s>"%(prefixes[self.namespace], self.name) + + def toxml(self): + result = '<' + self.name + if self.attributes: + for name,value in self.attributes.iteritems(): + result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'})) + if self.childNodes: + result += '>' + for child in self.childNodes: + result += child.toxml() + result += u'</%s>' % self.name + else: + result += u'/>' + return result + + def hilite(self): + result = '<%s' % self.name + if self.attributes: + for name, value in self.attributes.iteritems(): + result += ' %s="%s"' % (name, escape(value, {'"':'&quot;'})) + if self.childNodes: + result += ">" + for child in self.childNodes: + result += child.hilite() + elif self.name in voidElements: + return result + ">" + return result + '</%s>' % self.name + + def printTree(self, indent): + tree = '\n|%s%s' % (' '*indent, unicode(self)) + indent += 2 + if self.attributes: + for name, value in sorted(self.attributes.iteritems()): + if isinstance(name, tuple): + name = "%s %s"%(name[0], name[1]) + tree += '\n|%s%s="%s"' % (' ' * indent, name, value) + for child in self.childNodes: + tree += child.printTree(indent) + return tree + + def cloneNode(self): + newNode = Element(self.name) + if hasattr(self, 'namespace'): + newNode.namespace = self.namespace + for attr, value in self.attributes.iteritems(): + newNode.attributes[attr] = value + return newNode + +class CommentNode(Node): + type = 6 + def __init__(self, data): + Node.__init__(self, None) + self.data = data + + def __unicode__(self): + return "<!-- %s -->" % self.data + + def toxml(self): + return "<!--%s-->" % self.data + + def hilite(self): + return '<!--%s-->' % escape(self.data) + + def cloneNode(self): + return CommentNode(self.data) + +class TreeBuilder(_base.TreeBuilder): + documentClass = Document + doctypeClass = DocumentType + elementClass = Element + commentClass = CommentNode + fragmentClass = DocumentFragment + + def testSerializer(self, node): + return node.printTree() diff --git a/libs/html5lib/treebuilders/soup.py b/libs/html5lib/treebuilders/soup.py new file mode 100644 index
0000000..9bc5ff0 --- /dev/null +++ b/libs/html5lib/treebuilders/soup.py @@ -0,0 +1,236 @@ +import warnings + +warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning) + +from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration + +import _base +from html5lib.constants import namespaces, DataLossWarning + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return self.attrs.items().__iter__() + def __setitem__(self, name, value): + "set attr", name, value + self.element[name] = value + def items(self): + return self.attrs.items() + def keys(self): + return self.attrs.keys() + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in self.attrs.keys() + def __eq__(self, other): + if len(self.keys()) != len(other.keys()): + return False + for item in self.keys(): + if item not in other: + return False + if self[item] != other[item]: + return False + return True + +class Element(_base.Node): + def __init__(self, element, soup, namespace): + _base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def _nodeIndex(self, node, refNode): + # Finds a node by identity rather than equality + for index in range(len(self.element.contents)): + if id(self.element.contents[index]) == id(refNode.element): + return index + return None + + def appendChild(self, node): + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # Concatenate new text onto old text node + # (TODO: This has O(n^2) performance, for input like "aaa...") + newStr = NavigableString(self.element.contents[-1]+node.element) + + # Remove the old text node + # (Can't simply use .extract() by itself, because it fails if + # an equal text node exists within the parent node) + oldElement = self.element.contents[-1] + del self.element.contents[-1] + oldElement.parent = None + oldElement.extract() + + self.element.insert(len(self.element.contents), newStr) + else: + self.element.insert(len(self.element.contents), node.element) + node.parent = self + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes: + for name, value in attributes.items(): + self.element[name] = value + + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(NavigableString(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self._nodeIndex(node, refNode) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + newStr = NavigableString(self.element.contents[index-1]+node.element) + oldNode = self.element.contents[index-1] + del self.element.contents[index-1] + oldNode.parent = None + oldNode.extract() + + self.element.insert(index-1, newStr) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + index = self._nodeIndex(node.parent, node) + del node.parent.element.contents[index] + node.element.parent = None + node.element.extract() + node.parent = None + + def 
reparentChildren(self, newParent): + while self.element.contents: + child = self.element.contents[0] + child.extract() + if isinstance(child, Tag): + newParent.appendChild(Element(child, self.soup, namespaces["html"])) + else: + newParent.appendChild(TextNode(child, self.soup)) + + def cloneNode(self): + node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + _base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError + +class TreeBuilder(_base.TreeBuilder): + def __init__(self, namespaceHTMLElements): + if namespaceHTMLElements: + warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) + _base.TreeBuilder.__init__(self, namespaceHTMLElements) + + def documentClass(self): + self.soup = BeautifulSoup("") + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + if publicId: + self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or ""))) + elif systemId: + self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""% + (name, systemId))) + else: + self.soup.insert(0, Declaration("DOCTYPE %s"%name)) + + def elementClass(self, name, namespace): + if namespace is not None: + warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) + return Element(Tag(self.soup, name), self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + self.soup.insert(len(self.soup.contents), node.element) + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return _base.TreeBuilder.getFragment(self).element + +def testSerializer(element): + import re + rv = [] + def serializeElement(element, indent=0): + if isinstance(element, Declaration): + doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
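+ # Added note, not in the original source: the regexp above splits a Declaration + # such as 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"' + # into name='html', publicId='-//W3C//DTD HTML 4.01//EN' and systemId1 set, + # while for a bare 'DOCTYPE html' the optional groups are all None.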
+ m = re.compile(doctype_regexp).match(element.string) + assert m is not None, "DOCTYPE did not match expected format" + name = m.group('name') + publicId = m.group('publicId') + if publicId is not None: + systemId = m.group('systemId1') or "" + else: + systemId = m.group('systemId2') + + if publicId is not None or systemId is not None: + rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""% + (' '*indent, name, publicId or "", systemId or "")) + else: + rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name)) + + elif isinstance(element, BeautifulSoup): + if element.name == "[document_fragment]": + rv.append("#document-fragment") + else: + rv.append("#document") + + elif isinstance(element, Comment): + rv.append("|%s<!-- %s -->"%(' '*indent, element.string)) + elif isinstance(element, unicode): + rv.append("|%s\"%s\"" %(' '*indent, element)) + else: + rv.append("|%s<%s>"%(' '*indent, element.name)) + if element.attrs: + for name, value in sorted(element.attrs): + rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) + indent += 2 + if hasattr(element, "contents"): + for child in element.contents: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) diff --git a/libs/html5lib/treewalkers/__init__.py b/libs/html5lib/treewalkers/__init__.py new file mode 100644 index 0000000..3a606a8 --- /dev/null +++ b/libs/html5lib/treewalkers/__init__.py @@ -0,0 +1,52 @@ +"""A collection of modules for iterating through different kinds of +tree, generating tokens identical to those produced by the tokenizer +module. + +To create a tree walker for a new type of tree, you need to +implement a tree walker object (called TreeWalker by convention) that +implements a 'serialize' method taking a tree as sole argument and +returning an iterator generating tokens. +""" + +treeWalkerCache = {} + +def getTreeWalker(treeType, implementation=None, **kwargs): + """Get a TreeWalker class for various types of tree with built-in support + + treeType - the name of the tree type required (case-insensitive). Supported + values are "simpletree", "dom", "etree" and "beautifulsoup" + + "simpletree" - a built-in DOM-ish tree type with support for some + more pythonic idioms. + "dom" - The xml.dom.minidom DOM implementation + "pulldom" - The xml.dom.pulldom event stream + "etree" - A generic walker for tree implementations exposing an + elementtree-like interface (known to work with + ElementTree, cElementTree and lxml.etree). + "lxml" - Optimized walker for lxml.etree + "beautifulsoup" - Beautiful soup (if installed) + "genshi" - a Genshi stream + + implementation - (Currently applies to the "etree" tree type only). A module + implementing the tree type e.g.
xml.etree.ElementTree or + cElementTree.""" + + treeType = treeType.lower() + if treeType not in treeWalkerCache: + if treeType in ("dom", "pulldom", "simpletree"): + mod = __import__(treeType, globals()) + treeWalkerCache[treeType] = mod.TreeWalker + elif treeType == "genshi": + import genshistream + treeWalkerCache[treeType] = genshistream.TreeWalker + elif treeType == "beautifulsoup": + import soup + treeWalkerCache[treeType] = soup.TreeWalker + elif treeType == "lxml": + import lxmletree + treeWalkerCache[treeType] = lxmletree.TreeWalker + elif treeType == "etree": + import etree + # XXX: NEVER cache here, caching is done in the etree submodule + return etree.getETreeModule(implementation, **kwargs).TreeWalker + return treeWalkerCache.get(treeType) diff --git a/libs/html5lib/treewalkers/_base.py b/libs/html5lib/treewalkers/_base.py new file mode 100644 index 0000000..5929ba0 --- /dev/null +++ b/libs/html5lib/treewalkers/_base.py @@ -0,0 +1,176 @@ +import gettext +_ = gettext.gettext + +from html5lib.constants import voidElements, spaceCharacters +spaceCharacters = u"".join(spaceCharacters) + +class TreeWalker(object): + def __init__(self, tree): + self.tree = tree + + def __iter__(self): + raise NotImplementedError + + def error(self, msg): + return {"type": "SerializeError", "data": msg} + + def normalizeAttrs(self, attrs): + newattrs = {} + if attrs: + #TODO: treewalkers should always have attrs + for (namespace,name),value in attrs.iteritems(): + namespace = unicode(namespace) if namespace else None + name = unicode(name) + value = unicode(value) + newattrs[(namespace,name)] = value + return newattrs + + def emptyTag(self, namespace, name, attrs, hasChildren=False): + yield {"type": "EmptyTag", "name": unicode(name), + "namespace":unicode(namespace), + "data": self.normalizeAttrs(attrs)} + if hasChildren: + yield self.error(_("Void element has children")) + + def startTag(self, namespace, name, attrs): + return {"type": "StartTag", + "name": unicode(name), + "namespace":unicode(namespace), + "data": self.normalizeAttrs(attrs)} + + def endTag(self, namespace, name): + return {"type": "EndTag", + "name": unicode(name), + "namespace":unicode(namespace), + "data": {}} + + def text(self, data): + data = unicode(data) + middle = data.lstrip(spaceCharacters) + left = data[:len(data)-len(middle)] + if left: + yield {"type": "SpaceCharacters", "data": left} + data = middle + middle = data.rstrip(spaceCharacters) + right = data[len(middle):] + if middle: + yield {"type": "Characters", "data": middle} + if right: + yield {"type": "SpaceCharacters", "data": right} + + def comment(self, data): + return {"type": "Comment", "data": unicode(data)} + + def doctype(self, name, publicId=None, systemId=None, correct=True): + return {"type": "Doctype", + "name": name is not None and unicode(name) or u"", + "publicId": publicId, + "systemId": systemId, + "correct": correct} + + def entity(self, name): + return {"type": "Entity", "name": unicode(name)} + + def unknown(self, nodeType): + return self.error(_("Unknown node type: ") + nodeType) + +class RecursiveTreeWalker(TreeWalker): + def walkChildren(self, node): + raise NotImplementedError + + def element(self, node, namespace, name, attrs, hasChildren): + if name in voidElements: + for token in self.emptyTag(namespace, name, attrs, hasChildren): + yield token + else: + yield self.startTag(namespace, name, attrs) + if hasChildren: + for token in self.walkChildren(node): + yield token + yield self.endTag(namespace, name) + +from xml.dom import Node + +DOCUMENT = 
Node.DOCUMENT_NODE +DOCTYPE = Node.DOCUMENT_TYPE_NODE +TEXT = Node.TEXT_NODE +ELEMENT = Node.ELEMENT_NODE +COMMENT = Node.COMMENT_NODE +ENTITY = Node.ENTITY_NODE +UNKNOWN = "<#UNKNOWN#>" + +class NonRecursiveTreeWalker(TreeWalker): + def getNodeDetails(self, node): + raise NotImplementedError + + def getFirstChild(self, node): + raise NotImplementedError + + def getNextSibling(self, node): + raise NotImplementedError + + def getParentNode(self, node): + raise NotImplementedError + + def __iter__(self): + currentNode = self.tree + while currentNode is not None: + details = self.getNodeDetails(currentNode) + type, details = details[0], details[1:] + hasChildren = False + endTag = None + + if type == DOCTYPE: + yield self.doctype(*details) + + elif type == TEXT: + for token in self.text(*details): + yield token + + elif type == ELEMENT: + namespace, name, attributes, hasChildren = details + if name in voidElements: + for token in self.emptyTag(namespace, name, attributes, + hasChildren): + yield token + hasChildren = False + else: + endTag = name + yield self.startTag(namespace, name, attributes) + + elif type == COMMENT: + yield self.comment(details[0]) + + elif type == ENTITY: + yield self.entity(details[0]) + + elif type == DOCUMENT: + hasChildren = True + + else: + yield self.unknown(details[0]) + + if hasChildren: + firstChild = self.getFirstChild(currentNode) + else: + firstChild = None + + if firstChild is not None: + currentNode = firstChild + else: + while currentNode is not None: + details = self.getNodeDetails(currentNode) + type, details = details[0], details[1:] + if type == ELEMENT: + namespace, name, attributes, hasChildren = details + if name not in voidElements: + yield self.endTag(namespace, name) + if self.tree is currentNode: + currentNode = None + break + nextSibling = self.getNextSibling(currentNode) + if nextSibling is not None: + currentNode = nextSibling + break + else: + currentNode = self.getParentNode(currentNode) diff --git a/libs/html5lib/treewalkers/dom.py b/libs/html5lib/treewalkers/dom.py new file mode 100644 index 0000000..383b46c --- /dev/null +++ b/libs/html5lib/treewalkers/dom.py @@ -0,0 +1,41 @@ +from xml.dom import Node + +import gettext +_ = gettext.gettext + +import _base +from html5lib.constants import voidElements + +class TreeWalker(_base.NonRecursiveTreeWalker): + def getNodeDetails(self, node): + if node.nodeType == Node.DOCUMENT_TYPE_NODE: + return _base.DOCTYPE, node.name, node.publicId, node.systemId + + elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE): + return _base.TEXT, node.nodeValue + + elif node.nodeType == Node.ELEMENT_NODE: + attrs = {} + for attr in node.attributes.keys(): + attr = node.getAttributeNode(attr) + attrs[(attr.namespaceURI,attr.localName)] = attr.value + return (_base.ELEMENT, node.namespaceURI, node.nodeName, + attrs, node.hasChildNodes()) + + elif node.nodeType == Node.COMMENT_NODE: + return _base.COMMENT, node.nodeValue + + elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE): + return (_base.DOCUMENT,) + + else: + return _base.UNKNOWN, node.nodeType + + def getFirstChild(self, node): + return node.firstChild + + def getNextSibling(self, node): + return node.nextSibling + + def getParentNode(self, node): + return node.parentNode diff --git a/libs/html5lib/treewalkers/etree.py b/libs/html5lib/treewalkers/etree.py new file mode 100644 index 0000000..13b0319 --- /dev/null +++ b/libs/html5lib/treewalkers/etree.py @@ -0,0 +1,141 @@ +import gettext +_ = gettext.gettext + +try: + from types 
import ModuleType +except: + from new import module as ModuleType +import copy +import re + +import _base +from html5lib.constants import voidElements + +tag_regexp = re.compile("{([^}]*)}(.*)") + +moduleCache = {} + +def getETreeModule(ElementTreeImplementation): + name = "_" + ElementTreeImplementation.__name__+"builder" + if name in moduleCache: + return moduleCache[name] + else: + mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder") + objs = getETreeBuilder(ElementTreeImplementation) + mod.__dict__.update(objs) + moduleCache[name] = mod + return mod + +def getETreeBuilder(ElementTreeImplementation): + ElementTree = ElementTreeImplementation + + class TreeWalker(_base.NonRecursiveTreeWalker): + """Given the particular ElementTree representation, this implementation, + to avoid using recursion, returns "nodes" as tuples with the following + content: + + 1. The current element + + 2. The index of the element relative to its parent + + 3. A stack of ancestor elements + + 4. A flag "text", "tail" or None to indicate if the current node is a + text node; either the text or tail of the current element (1) + """ + def getNodeDetails(self, node): + if isinstance(node, tuple): # It might be the root Element + elt, key, parents, flag = node + if flag in ("text", "tail"): + return _base.TEXT, getattr(elt, flag) + else: + node = elt + + if not(hasattr(node, "tag")): + node = node.getroot() + + if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"): + return (_base.DOCUMENT,) + + elif node.tag == "<!DOCTYPE>": + return (_base.DOCTYPE, node.text, + node.get("publicId"), node.get("systemId")) + + elif node.tag == ElementTree.Comment: + return _base.COMMENT, node.text + + else: + assert type(node.tag) in (str, unicode), type(node.tag) + #This is assumed to be an ordinary element + match = tag_regexp.match(node.tag) + if match: + namespace, tag = match.groups() + else: + namespace = None + tag = node.tag + attrs = {} + for name, value in node.attrib.items(): + match = tag_regexp.match(name) + if match: + attrs[(match.group(1),match.group(2))] = value + else: + attrs[(None,name)] = value + return (_base.ELEMENT, namespace, tag, + attrs, len(node) or node.text) + + def getFirstChild(self, node): + if isinstance(node, tuple): + element, key, parents, flag = node + else: + element, key, parents, flag = node, None, [], None + + if flag in ("text", "tail"): + return None + else: + if element.text: + return element, key, parents, "text" + elif len(element): + parents.append(element) + return element[0], 0, parents, None + else: + return None + + def getNextSibling(self, node): + if isinstance(node, tuple): + element, key, parents, flag = node + else: + return None + + if flag == "text": + if len(element): + parents.append(element) + return element[0], 0, parents, None + else: + return None + else: + if element.tail and flag != "tail": + return element, key, parents, "tail" + elif key < len(parents[-1]) - 1: + return parents[-1][key+1], key+1, parents, None + else: + return None + + def getParentNode(self, node): + if isinstance(node, tuple): + element, key, parents, flag = node + else: + return None + + if flag == "text": + if not parents: + return element + else: + return element, key, parents, None + else: + parent = parents.pop() + if not parents: + return parent + else: + return parent, list(parents[-1]).index(parent), parents, None + + return locals() diff --git a/libs/html5lib/treewalkers/genshistream.py b/libs/html5lib/treewalkers/genshistream.py new file mode 100644 index 0000000..ef71a83 --- /dev/null +++ 
b/libs/html5lib/treewalkers/genshistream.py @@ -0,0 +1,70 @@ +from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT +from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT +from genshi.output import NamespaceFlattener + +import _base + +from html5lib.constants import voidElements + +class TreeWalker(_base.TreeWalker): + def __iter__(self): + depth = 0 + ignore_until = None + previous = None + for event in self.tree: + if previous is not None: + if previous[0] == START: + depth += 1 + if ignore_until <= depth: + ignore_until = None + if ignore_until is None: + for token in self.tokens(previous, event): + yield token + if token["type"] == "EmptyTag": + ignore_until = depth + if previous[0] == END: + depth -= 1 + previous = event + if previous is not None: + if ignore_until is None or ignore_until <= depth: + for token in self.tokens(previous, None): + yield token + elif ignore_until is not None: + raise ValueError("Illformed DOM event stream: void element without END_ELEMENT") + + def tokens(self, event, next): + kind, data, pos = event + if kind == START: + tag, attrib = data + name = tag.localname + namespace = tag.namespace + if tag in voidElements: + for token in self.emptyTag(namespace, name, list(attrib), + not next or next[0] != END + or next[1] != tag): + yield token + else: + yield self.startTag(namespace, name, list(attrib)) + + elif kind == END: + name = data.localname + namespace = data.namespace + if name not in voidElements: + yield self.endTag(namespace, name) + + elif kind == COMMENT: + yield self.comment(data) + + elif kind == TEXT: + for token in self.text(data): + yield token + + elif kind == DOCTYPE: + yield self.doctype(*data) + + elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \ + START_CDATA, END_CDATA, PI): + pass + + else: + yield self.unknown(kind) diff --git a/libs/html5lib/treewalkers/lxmletree.py b/libs/html5lib/treewalkers/lxmletree.py new file mode 100644 index 0000000..5f32805 --- /dev/null +++ b/libs/html5lib/treewalkers/lxmletree.py @@ -0,0 +1,186 @@ +from lxml import etree +from html5lib.treebuilders.etree import tag_regexp + +from gettext import gettext +_ = gettext + +import _base + +from html5lib.constants import voidElements +from html5lib import ihatexml + +class Root(object): + def __init__(self, et): + self.elementtree = et + self.children = [] + if et.docinfo.internalDTD: + self.children.append(Doctype(self, et.docinfo.root_name, + et.docinfo.public_id, + et.docinfo.system_url)) + root = et.getroot() + node = root + + while node.getprevious() is not None: + node = node.getprevious() + while node is not None: + self.children.append(node) + node = node.getnext() + + self.text = None + self.tail = None + + def __getitem__(self, key): + return self.children[key] + + def getnext(self): + return None + + def __len__(self): + return 1 + +class Doctype(object): + def __init__(self, root_node, name, public_id, system_id): + self.root_node = root_node + self.name = name + self.public_id = public_id + self.system_id = system_id + + self.text = None + self.tail = None + + def getnext(self): + return self.root_node.children[1] + +class FragmentRoot(Root): + def __init__(self, children): + self.children = [FragmentWrapper(self, child) for child in children] + self.text = self.tail = None + + def getnext(self): + return None + +class FragmentWrapper(object): + def __init__(self, fragment_root, obj): + self.root_node = fragment_root + self.obj = obj + if hasattr(self.obj, 'text'): + self.text = self.obj.text + else: + 
self.text = None + if hasattr(self.obj, 'tail'): + self.tail = self.obj.tail + else: + self.tail = None + self.isstring = isinstance(obj, basestring) + + def __getattr__(self, name): + return getattr(self.obj, name) + + def getnext(self): + siblings = self.root_node.children + idx = siblings.index(self) + if idx < len(siblings) - 1: + return siblings[idx + 1] + else: + return None + + def __getitem__(self, key): + return self.obj[key] + + def __nonzero__(self): + return bool(self.obj) + + def getparent(self): + return None + + def __str__(self): + return str(self.obj) + + def __unicode__(self): + return unicode(self.obj) + + def __len__(self): + return len(self.obj) + + +class TreeWalker(_base.NonRecursiveTreeWalker): + def __init__(self, tree): + if hasattr(tree, "getroot"): + tree = Root(tree) + elif isinstance(tree, list): + tree = FragmentRoot(tree) + _base.NonRecursiveTreeWalker.__init__(self, tree) + self.filter = ihatexml.InfosetFilter() + def getNodeDetails(self, node): + if isinstance(node, tuple): # Text node + node, key = node + assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + return _base.TEXT, getattr(node, key) + + elif isinstance(node, Root): + return (_base.DOCUMENT,) + + elif isinstance(node, Doctype): + return _base.DOCTYPE, node.name, node.public_id, node.system_id + + elif isinstance(node, FragmentWrapper) and node.isstring: + return _base.TEXT, node + + elif node.tag == etree.Comment: + return _base.COMMENT, node.text + + elif node.tag == etree.Entity: + return _base.ENTITY, node.text[1:-1] # strip &; + + else: + #This is assumed to be an ordinary element + match = tag_regexp.match(node.tag) + if match: + namespace, tag = match.groups() + else: + namespace = None + tag = node.tag + attrs = {} + for name, value in node.attrib.items(): + match = tag_regexp.match(name) + if match: + attrs[(match.group(1),match.group(2))] = value + else: + attrs[(None,name)] = value + return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag), + attrs, len(node) > 0 or node.text) + + def getFirstChild(self, node): + assert not isinstance(node, tuple), _("Text nodes have no children") + + assert len(node) or node.text, "Node has no children" + if node.text: + return (node, "text") + else: + return node[0] + + def getNextSibling(self, node): + if isinstance(node, tuple): # Text node + node, key = node + assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + if key == "text": + # XXX: we cannot use a "bool(node) and node[0] or None" construct here + # because node[0] might evaluate to False if it has no child element + if len(node): + return node[0] + else: + return None + else: # tail + return node.getnext() + + return node.tail and (node, "tail") or node.getnext() + + def getParentNode(self, node): + if isinstance(node, tuple): # Text node + node, key = node + assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + if key == "text": + return node + # else: fallback to "normal" processing + + return node.getparent() diff --git a/libs/html5lib/treewalkers/pulldom.py b/libs/html5lib/treewalkers/pulldom.py new file mode 100644 index 0000000..1f8b95b --- /dev/null +++ b/libs/html5lib/treewalkers/pulldom.py @@ -0,0 +1,60 @@ +from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \ + COMMENT, IGNORABLE_WHITESPACE, CHARACTERS + +import _base + +from html5lib.constants import voidElements + +class TreeWalker(_base.TreeWalker): + def __iter__(self): + ignore_until = None + previous = None + for event in 
self.tree: + if previous is not None and \ + (ignore_until is None or previous[1] is ignore_until): + if previous[1] is ignore_until: + ignore_until = None + for token in self.tokens(previous, event): + yield token + if token["type"] == "EmptyTag": + ignore_until = previous[1] + previous = event + if ignore_until is None or previous[1] is ignore_until: + for token in self.tokens(previous, None): + yield token + elif ignore_until is not None: + raise ValueError("Illformed DOM event stream: void element without END_ELEMENT") + + def tokens(self, event, next): + type, node = event + if type == START_ELEMENT: + name = node.nodeName + namespace = node.namespaceURI + attrs = {} + for attr in node.attributes.keys(): + attr = node.getAttributeNode(attr) + attrs[(attr.namespaceURI,attr.localName)] = attr.value + if name in voidElements: + for token in self.emptyTag(namespace, + name, + attrs, + not next or next[1] is not node): + yield token + else: + yield self.startTag(namespace, name, attrs) + + elif type == END_ELEMENT: + name = node.nodeName + namespace = node.namespaceURI + if name not in voidElements: + yield self.endTag(namespace, name) + + elif type == COMMENT: + yield self.comment(node.nodeValue) + + elif type in (IGNORABLE_WHITESPACE, CHARACTERS): + for token in self.text(node.nodeValue): + yield token + + else: + yield self.unknown(type) diff --git a/libs/html5lib/treewalkers/simpletree.py b/libs/html5lib/treewalkers/simpletree.py new file mode 100644 index 0000000..9e6bd4c --- /dev/null +++ b/libs/html5lib/treewalkers/simpletree.py @@ -0,0 +1,78 @@ +import gettext +_ = gettext.gettext + +import _base + +class TreeWalker(_base.NonRecursiveTreeWalker): + """Given that simpletree has no performant way of getting a node's + next sibling, this implementation returns "nodes" as tuples with the + following content: + + 1. The parent Node (Element, Document or DocumentFragment) + + 2. The child index of the current node in its parent's children list + + 3. A list used as a stack of all ancestors. It is a pair tuple whose + first item is a parent Node and second item is a child index. 
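+ For example (an illustrative tuple, not from the original docstring), + (parent, 1, [(root, 0)]) denotes parent.childNodes[1], where parent itself + was reached as root.childNodes[0].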
+ """ + + def getNodeDetails(self, node): + if isinstance(node, tuple): # It might be the root Node + parent, idx, parents = node + node = parent.childNodes[idx] + + # testing node.type allows us not to import treebuilders.simpletree + if node.type in (1, 2): # Document or DocumentFragment + return (_base.DOCUMENT,) + + elif node.type == 3: # DocumentType + return _base.DOCTYPE, node.name, node.publicId, node.systemId + + elif node.type == 4: # TextNode + return _base.TEXT, node.value + + elif node.type == 5: # Element + attrs = {} + for name, value in node.attributes.items(): + if isinstance(name, tuple): + attrs[(name[2],name[1])] = value + else: + attrs[(None,name)] = value + return (_base.ELEMENT, node.namespace, node.name, + attrs, node.hasContent()) + + elif node.type == 6: # CommentNode + return _base.COMMENT, node.data + + else: + return _base.UNKNOWN, node.type + + def getFirstChild(self, node): + if isinstance(node, tuple): # It might be the root Node + parent, idx, parents = node + parents.append((parent, idx)) + node = parent.childNodes[idx] + else: + parents = [] + + assert node.hasContent(), "Node has no children" + return (node, 0, parents) + + def getNextSibling(self, node): + assert isinstance(node, tuple), "Node is not a tuple: " + str(node) + parent, idx, parents = node + idx += 1 + if len(parent.childNodes) > idx: + return (parent, idx, parents) + else: + return None + + def getParentNode(self, node): + assert isinstance(node, tuple) + parent, idx, parents = node + if parents: + parent, idx = parents.pop() + return parent, idx, parents + else: + # HACK: We could return ``parent`` but None will stop the algorithm the same way + return None diff --git a/libs/html5lib/treewalkers/soup.py b/libs/html5lib/treewalkers/soup.py new file mode 100644 index 0000000..fca65ec --- /dev/null +++ b/libs/html5lib/treewalkers/soup.py @@ -0,0 +1,60 @@ +import re +import gettext +_ = gettext.gettext + +from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag +from html5lib.constants import namespaces +import _base + +class TreeWalker(_base.NonRecursiveTreeWalker): + doctype_regexp = re.compile( + r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?') + def getNodeDetails(self, node): + if isinstance(node, BeautifulSoup): # Document or DocumentFragment + return (_base.DOCUMENT,) + + elif isinstance(node, Declaration): # DocumentType + string = unicode(node.string) + #Slice needed to remove markup added during unicode conversion, + #but only in some versions of BeautifulSoup/Python + if string.startswith('<!') and string.endswith('>'): + string = string[2:-1] + m = self.doctype_regexp.match(string) + #This regexp approach seems wrong and fragile + #but beautiful soup stores the doctype as a single thing and we want the separate bits + #It should work as long as the tree is created by html5lib itself but may be wrong if it's + #been modified at all + #We could just feed to it a html5lib tokenizer, I guess...
+ assert m is not None, "DOCTYPE did not match expected format" + + name = m.group('name') + publicId = m.group('publicId') + if publicId is not None: + systemId = m.group('systemId1') + else: + systemId = m.group('systemId2') + return _base.DOCTYPE, name, publicId or "", systemId or "" + + elif isinstance(node, Comment): + string = unicode(node.string) + if string.startswith('<!--') and string.endswith('-->'): + string = string[4:-3] + return _base.COMMENT, string + + elif isinstance(node, unicode): # TextNode + return _base.TEXT, node + + elif isinstance(node, Tag): # Element + return (_base.ELEMENT, namespaces["html"], node.name, + dict(node.attrs).items(), node.contents) + else: + return _base.UNKNOWN, node.__class__.__name__ + + def getFirstChild(self, node): + return node.contents[0] + + def getNextSibling(self, node): + return node.nextSibling + + def getParentNode(self, node): + return node.parent diff --git a/libs/html5lib/utils.py b/libs/html5lib/utils.py new file mode 100644 index 0000000..d53f678 --- /dev/null +++ b/libs/html5lib/utils.py @@ -0,0 +1,175 @@ +try: + frozenset +except NameError: + #Import from the sets module for python 2.3 + from sets import Set as set + from sets import ImmutableSet as frozenset + +class MethodDispatcher(dict): + """Dict with 2 special properties: + + On initiation, keys that are lists, sets or tuples are converted to + multiple keys so accessing any one of the items in the original + list-like object returns the matching value + + md = MethodDispatcher({("foo", "bar"):"baz"}) + md["foo"] == "baz" + + A default value which can be set through the default attribute. + """ + + def __init__(self, items=()): + # Using _dictEntries instead of directly assigning to self is about + # twice as fast. Please do careful performance testing before changing + # anything here.
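+ # Added note, not in the original source: lookups of unknown keys do not + # raise KeyError; __getitem__ below falls back to self.default, which + # stays None unless a caller assigns it explicitly.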
+ _dictEntries = [] + for name,value in items: + if type(name) in (list, tuple, frozenset, set): + for item in name: + _dictEntries.append((item, value)) + else: + _dictEntries.append((name, value)) + dict.__init__(self, _dictEntries) + self.default = None + + def __getitem__(self, key): + return dict.get(self, key, self.default) + +#Pure python implementation of deque taken from the ASPN Python Cookbook +#Original code by Raymond Hettinger + +class deque(object): + + def __init__(self, iterable=(), maxsize=-1): + if not hasattr(self, 'data'): + self.left = self.right = 0 + self.data = {} + self.maxsize = maxsize + self.extend(iterable) + + def append(self, x): + self.data[self.right] = x + self.right += 1 + if self.maxsize != -1 and len(self) > self.maxsize: + self.popleft() + + def appendleft(self, x): + self.left -= 1 + self.data[self.left] = x + if self.maxsize != -1 and len(self) > self.maxsize: + self.pop() + + def pop(self): + if self.left == self.right: + raise IndexError('cannot pop from empty deque') + self.right -= 1 + elem = self.data[self.right] + del self.data[self.right] + return elem + + def popleft(self): + if self.left == self.right: + raise IndexError('cannot pop from empty deque') + elem = self.data[self.left] + del self.data[self.left] + self.left += 1 + return elem + + def clear(self): + self.data.clear() + self.left = self.right = 0 + + def extend(self, iterable): + for elem in iterable: + self.append(elem) + + def extendleft(self, iterable): + for elem in iterable: + self.appendleft(elem) + + def rotate(self, n=1): + if self: + n %= len(self) + for i in xrange(n): + self.appendleft(self.pop()) + + def __getitem__(self, i): + if i < 0: + i += len(self) + try: + return self.data[i + self.left] + except KeyError: + raise IndexError + + def __setitem__(self, i, value): + if i < 0: + i += len(self) + try: + self.data[i + self.left] = value + except KeyError: + raise IndexError + + def __delitem__(self, i): + size = len(self) + if not (-size <= i < size): + raise IndexError + data = self.data + if i < 0: + i += size + for j in xrange(self.left+i, self.right-1): + data[j] = data[j+1] + self.pop() + + def __len__(self): + return self.right - self.left + + def __cmp__(self, other): + if type(self) != type(other): + return cmp(type(self), type(other)) + return cmp(list(self), list(other)) + + def __repr__(self, _track=[]): + if id(self) in _track: + return '...' 
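+ # Added note, not in the original source: the _track guard handles + # self-referential deques, so d.append(d) followed by repr(d) yields + # 'deque([...])' instead of recursing forever.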
+ _track.append(id(self)) + r = 'deque(%r)' % (list(self),) + _track.remove(id(self)) + return r + + def __getstate__(self): + return (tuple(self),) + + def __setstate__(self, s): + self.__init__(s[0]) + + def __hash__(self): + raise TypeError + + def __copy__(self): + return self.__class__(self) + + def __deepcopy__(self, memo={}): + from copy import deepcopy + result = self.__class__() + memo[id(self)] = result + result.__init__(deepcopy(tuple(self), memo)) + return result + +#Some utility functions to deal with weirdness around UCS2 vs UCS4 +#python builds + +def encodingType(): + # On a narrow (UCS2) build a non-BMP character is stored as a + # surrogate pair, so the literal below has length 2 there. + if len(u"\U0010FFFF") == 2: + return "UCS2" + else: + return "UCS4" + +def isSurrogatePair(data): + return (len(data) == 2 and + ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and + ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF) + +def surrogatePairToCodepoint(data): + char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 + + (ord(data[1]) - 0xDC00)) + return char_val diff --git a/libs/oauthlib/__init__.py b/libs/oauthlib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/oauthlib/common.py b/libs/oauthlib/common.py new file mode 100644 index 0000000..4cdfd0d --- /dev/null +++ b/libs/oauthlib/common.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +""" +oauthlib.common +~~~~~~~~~~~~~~ + +This module provides data structures and utilities common +to all implementations of OAuth. +""" + +import re +import urllib +import urlparse + + +always_safe = (u'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + u'abcdefghijklmnopqrstuvwxyz' + u'0123456789' u'_.-') + + +def quote(s, safe=u'/'): + encoded = s.encode("utf-8") + quoted = urllib.quote(encoded, safe) + return quoted.decode("utf-8") + + +def unquote(s): + encoded = s.encode("utf-8") + unquoted = urllib.unquote(encoded) + return unquoted.decode("utf-8") + + +def urlencode(params): + utf8_params = encode_params_utf8(params) + urlencoded = urllib.urlencode(utf8_params) + return urlencoded.decode("utf-8") + + +def encode_params_utf8(params): + """Ensures that all parameters in a list of 2-element tuples are encoded to + bytestrings using UTF-8 + """ + encoded = [] + for k, v in params: + encoded.append(( + k.encode('utf-8') if isinstance(k, unicode) else k, + v.encode('utf-8') if isinstance(v, unicode) else v)) + return encoded + + +def decode_params_utf8(params): + """Ensures that all parameters in a list of 2-element tuples are decoded to + unicode using UTF-8. + """ + decoded = [] + for k, v in params: + decoded.append(( + k.decode('utf-8') if isinstance(k, str) else k, + v.decode('utf-8') if isinstance(v, str) else v)) + return decoded + + +urlencoded = set(always_safe) | set(u'=&;%+~') + + +def urldecode(query): + """Decode a query string in x-www-form-urlencoded format into a sequence + of two-element tuples. + + Unlike urlparse.parse_qsl(..., strict_parsing=True) urldecode will enforce + correct formatting of the query string by validation. If validation fails + a ValueError will be raised. urllib.parse_qsl will only raise errors if + any of the name-value pairs omits the equals sign.
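+ + Illustrative examples, added here and not part of the original docstring: + urldecode(u'a=1&b=%20') returns [(u'a', u'1'), (u'b', u' ')], while + urldecode(u'a=%XX') raises ValueError because of the invalid hex escape.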
+ """ + # Check if query contains invalid characters + if query and not set(query) <= urlencoded: + raise ValueError('Invalid characters in query string.') + + # Check for correctly hex encoded values using a regular expression + # All encoded values begin with % followed by two hex characters + # correct = %00, %A0, %0A, %FF + # invalid = %G0, %5H, %PO + invalid_hex = u'%[^0-9A-Fa-f]|%[0-9A-Fa-f][^0-9A-Fa-f]' + if len(re.findall(invalid_hex, query)): + raise ValueError('Invalid hex encoding in query string.') + + query = query.decode('utf-8') if isinstance(query, str) else query + # We want to allow queries such as "c2" whereas urlparse.parse_qsl + # with the strict_parsing flag will not. + params = urlparse.parse_qsl(query, keep_blank_values=True) + + # unicode all the things + return decode_params_utf8(params) + + +def extract_params(raw): + """Extract parameters and return them as a list of 2-tuples. + + Will successfully extract parameters from urlencoded query strings, + dicts, or lists of 2-tuples. Empty strings/dicts/lists will return an + empty list of parameters. Any other input will result in a return + value of None. + """ + if isinstance(raw, basestring): + try: + params = urldecode(raw) + except ValueError: + params = None + elif hasattr(raw, '__iter__'): + try: + dict(raw) + except ValueError: + params = None + except TypeError: + params = None + else: + params = list(raw.items() if isinstance(raw, dict) else raw) + params = decode_params_utf8(params) + else: + params = None + + return params + + +class Request(object): + """A malleable representation of a signable HTTP request. + + Body argument may contain any data, but parameters will only be decoded if + they are one of: + + * urlencoded query string + * dict + * list of 2-tuples + + Anything else will be treated as raw body data to be passed through + unmolested. + """ + + def __init__(self, uri, http_method=u'GET', body=None, headers=None): + self.uri = uri + self.http_method = http_method + self.headers = headers or {} + self.body = body + self.decoded_body = extract_params(body) + self.oauth_params = [] + + @property + def uri_query(self): + return urlparse.urlparse(self.uri).query + + @property + def uri_query_params(self): + return urlparse.parse_qsl(self.uri_query, keep_blank_values=True, + strict_parsing=True) diff --git a/libs/oauthlib/oauth1/__init__.py b/libs/oauthlib/oauth1/__init__.py new file mode 100644 index 0000000..ef692b5 --- /dev/null +++ b/libs/oauthlib/oauth1/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +""" +oauthlib.oauth1 +~~~~~~~~~~~~~~ + +This module is a wrapper for the most recent implementation of OAuth 1.0 Client +and Server classes. +""" + +from .rfc5849 import Client, Server + diff --git a/libs/oauthlib/oauth1/rfc5849/__init__.py b/libs/oauthlib/oauth1/rfc5849/__init__.py new file mode 100644 index 0000000..03fb8b2 --- /dev/null +++ b/libs/oauthlib/oauth1/rfc5849/__init__.py @@ -0,0 +1,350 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +""" +oauthlib.oauth1.rfc5849 +~~~~~~~~~~~~~~ + +This module is an implementation of various logic needed +for signing and checking OAuth 1.0 RFC 5849 requests. +""" + +import logging +import urlparse + +from oauthlib.common import Request, urlencode +from . 
import parameters, signature, utils + +SIGNATURE_HMAC = u"HMAC-SHA1" +SIGNATURE_RSA = u"RSA-SHA1" +SIGNATURE_PLAINTEXT = u"PLAINTEXT" +SIGNATURE_METHODS = (SIGNATURE_HMAC, SIGNATURE_RSA, SIGNATURE_PLAINTEXT) + +SIGNATURE_TYPE_AUTH_HEADER = u'AUTH_HEADER' +SIGNATURE_TYPE_QUERY = u'QUERY' +SIGNATURE_TYPE_BODY = u'BODY' + +CONTENT_TYPE_FORM_URLENCODED = u'application/x-www-form-urlencoded' + + +class Client(object): + """A client used to sign OAuth 1.0 RFC 5849 requests""" + def __init__(self, client_key, + client_secret=None, + resource_owner_key=None, + resource_owner_secret=None, + callback_uri=None, + signature_method=SIGNATURE_HMAC, + signature_type=SIGNATURE_TYPE_AUTH_HEADER, + rsa_key=None, verifier=None): + self.client_key = client_key + self.client_secret = client_secret + self.resource_owner_key = resource_owner_key + self.resource_owner_secret = resource_owner_secret + self.signature_method = signature_method + self.signature_type = signature_type + self.callback_uri = callback_uri + self.rsa_key = rsa_key + self.verifier = verifier + + if self.signature_method == SIGNATURE_RSA and self.rsa_key is None: + raise ValueError('rsa_key is required when using RSA signature method.') + + def get_oauth_signature(self, request): + """Get an OAuth signature to be used in signing a request + """ + if self.signature_method == SIGNATURE_PLAINTEXT: + # fast-path + return signature.sign_plaintext(self.client_secret, + self.resource_owner_secret) + + uri, headers, body = self._render(request) + + collected_params = signature.collect_parameters( + uri_query=urlparse.urlparse(uri).query, + body=body, + headers=headers) + logging.debug("Collected params: {0}".format(collected_params)) + + normalized_params = signature.normalize_parameters(collected_params) + normalized_uri = signature.normalize_base_string_uri(request.uri) + logging.debug("Normalized params: {0}".format(normalized_params)) + logging.debug("Normalized URI: {0}".format(normalized_uri)) + + base_string = signature.construct_base_string(request.http_method, + normalized_uri, normalized_params) + + logging.debug("Base signing string: {0}".format(base_string)) + + if self.signature_method == SIGNATURE_HMAC: + sig = signature.sign_hmac_sha1(base_string, self.client_secret, + self.resource_owner_secret) + elif self.signature_method == SIGNATURE_RSA: + sig = signature.sign_rsa_sha1(base_string, self.rsa_key) + else: + sig = signature.sign_plaintext(self.client_secret, + self.resource_owner_secret) + + logging.debug("Signature: {0}".format(sig)) + return sig + + def get_oauth_params(self): + """Get the basic OAuth parameters to be used in generating a signature. + """ + params = [ + (u'oauth_nonce', utils.generate_nonce()), + (u'oauth_timestamp', utils.generate_timestamp()), + (u'oauth_version', u'1.0'), + (u'oauth_signature_method', self.signature_method), + (u'oauth_consumer_key', self.client_key), + ] + if self.resource_owner_key: + params.append((u'oauth_token', self.resource_owner_key)) + if self.callback_uri: + params.append((u'oauth_callback', self.callback_uri)) + if self.verifier: + params.append((u'oauth_verifier', self.verifier)) + + return params + + def _render(self, request, formencode=False): + """Render a signed request according to signature type + + Returns a 3-tuple containing the request URI, headers, and body. + + If the formencode argument is True and the body contains parameters, it + is escaped and returned as a valid formencoded string. + """ + # TODO what if there are body params on a header-type auth? 
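+ # Added summary, not in the original source: AUTH_HEADER renders the + # oauth_* pairs into the Authorization header via prepare_headers, BODY + # merges them into the form-encoded entity-body via prepare_form_encoded_body, + # and QUERY appends them to the request URI via prepare_request_uri_query.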
+ # TODO what if there are query params on a body-type auth? + + uri, headers, body = request.uri, request.headers, request.body + + # TODO: right now these prepare_* methods are very narrow in scope--they + # only affect their little thing. In some cases (for example, with + # header auth) it might be advantageous to allow these methods to touch + # other parts of the request, like the headers—so the prepare_headers + # method could also set the Content-Type header to x-www-form-urlencoded + # like the spec requires. This would be a fundamental change though, and + # I'm not sure how I feel about it. + if self.signature_type == SIGNATURE_TYPE_AUTH_HEADER: + headers = parameters.prepare_headers(request.oauth_params, request.headers) + elif self.signature_type == SIGNATURE_TYPE_BODY and request.decoded_body is not None: + body = parameters.prepare_form_encoded_body(request.oauth_params, request.decoded_body) + if formencode: + body = urlencode(body) + headers['Content-Type'] = u'application/x-www-form-urlencoded' + elif self.signature_type == SIGNATURE_TYPE_QUERY: + uri = parameters.prepare_request_uri_query(request.oauth_params, request.uri) + else: + raise ValueError('Unknown signature type specified.') + + return uri, headers, body + + def sign(self, uri, http_method=u'GET', body=None, headers=None): + """Sign a request + + Signs an HTTP request with the specified parts. + + Returns a 3-tuple of the signed request's URI, headers, and body. + Note that http_method is not returned as it is unaffected by the OAuth + signing process. + + The body argument may be a dict, a list of 2-tuples, or a formencoded + string. The Content-Type header must be 'application/x-www-form-urlencoded' + if it is present. + + If the body argument is not one of the above, it will be returned + verbatim as it is unaffected by the OAuth signing process. Attempting to + sign a request with non-formencoded data using the OAuth body signature + type is invalid and will raise an exception. + + If the body does contain parameters, it will be returned as a properly- + formatted formencoded string. + + All string data MUST be unicode. This includes strings inside body + dicts, for example. + """ + # normalize request data + request = Request(uri, http_method, body, headers) + + # sanity check + content_type = request.headers.get('Content-Type', None) + multipart = content_type and content_type.startswith('multipart/') + should_have_params = content_type == CONTENT_TYPE_FORM_URLENCODED + has_params = request.decoded_body is not None + # 3.4.1.3.1. Parameter Sources + # [Parameters are collected from the HTTP request entity-body, but only + # if [...]: + # * The entity-body is single-part. + if multipart and has_params: + raise ValueError("Headers indicate a multipart body but body contains parameters.") + # * The entity-body follows the encoding requirements of the + # "application/x-www-form-urlencoded" content-type as defined by + # [W3C.REC-html40-19980424]. + elif should_have_params and not has_params: + raise ValueError("Headers indicate a formencoded body but body was not decodable.") + # * The HTTP request entity-header includes the "Content-Type" + # header field set to "application/x-www-form-urlencoded". + elif not should_have_params and has_params: + raise ValueError("Body contains parameters but Content-Type header was not set.") + + # 3.5.2. 
Form-Encoded Body + # Protocol parameters can be transmitted in the HTTP request entity- + # body, but only if the following REQUIRED conditions are met: + # o The entity-body is single-part. + # o The entity-body follows the encoding requirements of the + # "application/x-www-form-urlencoded" content-type as defined by + # [W3C.REC-html40-19980424]. + # o The HTTP request entity-header includes the "Content-Type" header + # field set to "application/x-www-form-urlencoded". + elif self.signature_type == SIGNATURE_TYPE_BODY and not ( + should_have_params and has_params and not multipart): + raise ValueError('Body signatures may only be used with form-urlencoded content') + + # generate the basic OAuth parameters + request.oauth_params = self.get_oauth_params() + + # generate the signature + request.oauth_params.append((u'oauth_signature', self.get_oauth_signature(request))) + + # render the signed request and return it + return self._render(request, formencode=True) + + +class Server(object): + """A server used to verify OAuth 1.0 RFC 5849 requests""" + def __init__(self, signature_method=SIGNATURE_HMAC, rsa_key=None): + self.signature_method = signature_method + self.rsa_key = rsa_key + + def get_client_secret(self, client_key): + raise NotImplementedError("Subclasses must implement this function.") + + def get_resource_owner_secret(self, resource_owner_key): + raise NotImplementedError("Subclasses must implement this function.") + + def get_signature_type_and_params(self, uri_query, headers, body): + signature_types_with_oauth_params = filter(lambda s: s[1], ( + (SIGNATURE_TYPE_AUTH_HEADER, utils.filter_oauth_params( + signature.collect_parameters(headers=headers, + exclude_oauth_signature=False))), + (SIGNATURE_TYPE_BODY, utils.filter_oauth_params( + signature.collect_parameters(body=body, + exclude_oauth_signature=False))), + (SIGNATURE_TYPE_QUERY, utils.filter_oauth_params( + signature.collect_parameters(uri_query=uri_query, + exclude_oauth_signature=False))), + )) + + if len(signature_types_with_oauth_params) > 1: + raise ValueError('oauth_ params must come from only 1 signature type but were found in %s' % ', '.join( + [s[0] for s in signature_types_with_oauth_params])) + try: + signature_type, params = signature_types_with_oauth_params[0] + except IndexError: + raise ValueError('oauth_ params are missing. Could not determine signature type.') + + return signature_type, dict(params) + + def check_client_key(self, client_key): + raise NotImplementedError("Subclasses must implement this function.") + + def check_resource_owner_key(self, client_key, resource_owner_key): + raise NotImplementedError("Subclasses must implement this function.") + + def check_timestamp_and_nonce(self, timestamp, nonce): + raise NotImplementedError("Subclasses must implement this function.") + + def check_request_signature(self, uri, http_method=u'GET', body='', + headers=None): + """Check a request's supplied signature to make sure the request is + valid. + + Servers should return HTTP status 400 if a ValueError exception + is raised and HTTP status 401 on return value False. + + Per `section 3.2`_ of the spec. + + .. _`section 3.2`: http://tools.ietf.org/html/rfc5849#section-3.2 + """ + headers = headers or {} + signature_type = None + # FIXME: urlparse does not return unicode! 
+ uri_query = urlparse.urlparse(uri).query + + signature_type, params = self.get_signature_type_and_params(uri_query, + headers, body) + + # the parameters may not include duplicate oauth entries + filtered_params = utils.filter_oauth_params(params) + if len(filtered_params) != len(params): + raise ValueError("Duplicate OAuth entries.") + + params = dict(params) + request_signature = params.get(u'oauth_signature') + client_key = params.get(u'oauth_consumer_key') + resource_owner_key = params.get(u'oauth_token') + nonce = params.get(u'oauth_nonce') + timestamp = params.get(u'oauth_timestamp') + callback_uri = params.get(u'oauth_callback') + verifier = params.get(u'oauth_verifier') + signature_method = params.get(u'oauth_signature_method') + + # ensure all mandatory parameters are present + if not all((request_signature, client_key, nonce, + timestamp, signature_method)): + raise ValueError("Missing OAuth parameters.") + + # if version is supplied, it must be "1.0" + if u'oauth_version' in params and params[u'oauth_version'] != u'1.0': + raise ValueError("Invalid OAuth version.") + + # signature method must be valid + if not signature_method in SIGNATURE_METHODS: + raise ValueError("Invalid signature method.") + + # ensure client key is valid + if not self.check_client_key(client_key): + return False + + # ensure resource owner key is valid and not expired + if not self.check_resource_owner_key(client_key, resource_owner_key): + return False + + # ensure the nonce and timestamp haven't been used before + if not self.check_timestamp_and_nonce(timestamp, nonce): + return False + + # FIXME: extract realm, then self.check_realm + + # oauth_client parameters depend on client chosen signature method + # which may vary for each request, section 3.4 + # HMAC-SHA1 and PLAINTEXT share parameters + if signature_method == SIGNATURE_RSA: + oauth_client = Client(client_key, + resource_owner_key=resource_owner_key, + callback_uri=callback_uri, + signature_method=signature_method, + signature_type=signature_type, + rsa_key=self.rsa_key, verifier=verifier) + else: + client_secret = self.get_client_secret(client_key) + resource_owner_secret = self.get_resource_owner_secret( + resource_owner_key) + oauth_client = Client(client_key, + client_secret=client_secret, + resource_owner_key=resource_owner_key, + resource_owner_secret=resource_owner_secret, + callback_uri=callback_uri, + signature_method=signature_method, + signature_type=signature_type, + verifier=verifier) + + request = Request(uri, http_method, body, headers) + request.oauth_params = params + + client_signature = oauth_client.get_oauth_signature(request) + + # FIXME: use near constant time string compare to avoid timing attacks + return client_signature == request_signature diff --git a/libs/oauthlib/oauth1/rfc5849/parameters.py b/libs/oauthlib/oauth1/rfc5849/parameters.py new file mode 100644 index 0000000..dee23a4 --- /dev/null +++ b/libs/oauthlib/oauth1/rfc5849/parameters.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +""" +oauthlib.parameters +~~~~~~~~~~~~~~~~~~~ + +This module contains methods related to `section 3.5`_ of the OAuth 1.0a spec. + +.. _`section 3.5`: http://tools.ietf.org/html/rfc5849#section-3.5 +""" + +from urlparse import urlparse, urlunparse +from . import utils +from oauthlib.common import extract_params, urlencode + + +# TODO: do we need filter_params now that oauth_params are handled by Request? +# We can easily pass in just oauth protocol params. 
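+# +# A hypothetical usage sketch (illustrative only, values made up): given a +# list of oauth_* 2-tuples, prepare_headers() below renders them into a +# single Authorization header, e.g. +# +# >>> prepare_headers([(u'oauth_token', u'abc'), (u'oauth_nonce', u'1')]) +# {u'Authorization': u'OAuth oauth_token="abc", oauth_nonce="1"'}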
+@utils.filter_params +def prepare_headers(oauth_params, headers=None, realm=None): + """**Prepare the Authorization header.** + Per `section 3.5.1`_ of the spec. + + Protocol parameters can be transmitted using the HTTP "Authorization" + header field as defined by `RFC2617`_ with the auth-scheme name set to + "OAuth" (case insensitive). + + For example:: + + Authorization: OAuth realm="Example", + oauth_consumer_key="0685bd9184jfhq22", + oauth_token="ad180jjd733klru7", + oauth_signature_method="HMAC-SHA1", + oauth_signature="wOJIO9A2W5mFwDgiDvZbTSMK%2FPY%3D", + oauth_timestamp="137131200", + oauth_nonce="4572616e48616d6d65724c61686176", + oauth_version="1.0" + + + .. _`section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1 + .. _`RFC2617`: http://tools.ietf.org/html/rfc2617 + """ + headers = headers or {} + + # Protocol parameters SHALL be included in the "Authorization" header + # field as follows: + authorization_header_parameters_parts = [] + for oauth_parameter_name, value in oauth_params: + # 1. Parameter names and values are encoded per Parameter Encoding + # (`Section 3.6`_) + # + # .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + escaped_name = utils.escape(oauth_parameter_name) + escaped_value = utils.escape(value) + + # 2. Each parameter's name is immediately followed by an "=" character + # (ASCII code 61), a """ character (ASCII code 34), the parameter + # value (MAY be empty), and another """ character (ASCII code 34). + part = u'{0}="{1}"'.format(escaped_name, escaped_value) + + authorization_header_parameters_parts.append(part) + + # 3. Parameters are separated by a "," character (ASCII code 44) and + # OPTIONAL linear whitespace per `RFC2617`_. + # + # .. _`RFC2617`: http://tools.ietf.org/html/rfc2617 + authorization_header_parameters = ', '.join( + authorization_header_parameters_parts) + + # 4. The OPTIONAL "realm" parameter MAY be added and interpreted per + # `RFC2617 section 1.2`_. + # + # .. _`RFC2617 section 1.2`: http://tools.ietf.org/html/rfc2617#section-1.2 + if realm: + # NOTE: realm should *not* be escaped + authorization_header_parameters = (u'realm="%s", ' % realm + + authorization_header_parameters) + + # the auth-scheme name set to "OAuth" (case insensitive). + authorization_header = u'OAuth %s' % authorization_header_parameters + + # contribute the Authorization header to the given headers + full_headers = {} + full_headers.update(headers) + full_headers[u'Authorization'] = authorization_header + return full_headers + + +def _append_params(oauth_params, params): + """Append OAuth params to an existing set of parameters. + + Both params and oauth_params must be lists of 2-tuples. + + Per `section 3.5.2`_ and `3.5.3`_ of the spec. + + .. _`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2 + .. _`3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3 + + """ + merged = list(params) + merged.extend(oauth_params) + # The request URI / entity-body MAY include other request-specific + # parameters, in which case, the protocol parameters SHOULD be appended + # following the request-specific parameters, properly separated by an "&" + # character (ASCII code 38) + merged.sort(key=lambda i: i[0].startswith('oauth_')) + return merged + + +def prepare_form_encoded_body(oauth_params, body): + """Prepare the Form-Encoded Body. + + Per `section 3.5.2`_ of the spec. + + .. 
_`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2 + + """ + # append OAuth params to the existing body + return _append_params(oauth_params, body) + + +def prepare_request_uri_query(oauth_params, uri): + """Prepare the Request URI Query. + + Per `section 3.5.3`_ of the spec. + + .. _`section 3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3 + + """ + # append OAuth params to the existing set of query components + sch, net, path, par, query, fra = urlparse(uri) + query = urlencode(_append_params(oauth_params, extract_params(query) or [])) + return urlunparse((sch, net, path, par, query, fra)) diff --git a/libs/oauthlib/oauth1/rfc5849/signature.py b/libs/oauthlib/oauth1/rfc5849/signature.py new file mode 100644 index 0000000..99101d4 --- /dev/null +++ b/libs/oauthlib/oauth1/rfc5849/signature.py @@ -0,0 +1,501 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +""" +oauthlib.oauth1.rfc5849.signature +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This module represents a direct implementation of `section 3.4`_ of the spec. + +Terminology: + * Client: software interfacing with an OAuth API + * Server: the API provider + * Resource Owner: the user who is granting authorization to the client + +Steps for signing a request: + +1. Collect parameters from the uri query, auth header, & body +2. Normalize those parameters +3. Normalize the uri +4. Pass the normalized uri, normalized parameters, and http method to + construct the base string +5. Pass the base string and any keys needed to a signing function + +.. _`section 3.4`: http://tools.ietf.org/html/rfc5849#section-3.4 +""" +import binascii +import hashlib +import hmac +import urlparse +from . import utils +from oauthlib.common import extract_params + + +def construct_base_string(http_method, base_string_uri, + normalized_encoded_request_parameters): + """**String Construction** + Per `section 3.4.1.1`_ of the spec. + + For example, the HTTP request:: + + POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1 + Host: example.com + Content-Type: application/x-www-form-urlencoded + Authorization: OAuth realm="Example", + oauth_consumer_key="9djdj82h48djs9d2", + oauth_token="kkk9d7dh3k39sjv7", + oauth_signature_method="HMAC-SHA1", + oauth_timestamp="137131201", + oauth_nonce="7d8f3e4a", + oauth_signature="bYT5CMsGcbgUdFHObYMEfcx6bsw%3D" + + c2&a3=2+q + + is represented by the following signature base string (line breaks + are for display purposes only):: + + POST&http%3A%2F%2Fexample.com%2Frequest&a2%3Dr%2520b%26a3%3D2%2520q + %26a3%3Da%26b5%3D%253D%25253D%26c%2540%3D%26c2%3D%26oauth_consumer_ + key%3D9djdj82h48djs9d2%26oauth_nonce%3D7d8f3e4a%26oauth_signature_m + ethod%3DHMAC-SHA1%26oauth_timestamp%3D137131201%26oauth_token%3Dkkk + 9d7dh3k39sjv7 + + .. _`section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1 + """ + + # The signature base string is constructed by concatenating together, + # in order, the following HTTP request elements: + + # 1. The HTTP request method in uppercase. For example: "HEAD", + # "GET", "POST", etc. If the request uses a custom HTTP method, it + # MUST be encoded (`Section 3.6`_). + # + # .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + base_string = utils.escape(http_method.upper()) + + # 2. An "&" character (ASCII code 38). + base_string += u'&' + + # 3. The base string URI from `Section 3.4.1.2`_, after being encoded + # (`Section 3.6`_). + # + # .. _`Section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2 + # .. 
_`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + base_string += utils.escape(base_string_uri) + + # 4. An "&" character (ASCII code 38). + base_string += u'&' + + # 5. The request parameters as normalized in `Section 3.4.1.3.2`_, after + # being encoded (`Section 3.6`_). + # + # .. _`Section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2 + # .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + base_string += utils.escape(normalized_encoded_request_parameters) + + return base_string + + +def normalize_base_string_uri(uri): + """**Base String URI** + Per `section 3.4.1.2`_ of the spec. + + For example, the HTTP request:: + + GET /r%20v/X?id=123 HTTP/1.1 + Host: EXAMPLE.COM:80 + + is represented by the base string URI: "http://example.com/r%20v/X". + + In another example, the HTTPS request:: + + GET /?q=1 HTTP/1.1 + Host: www.example.net:8080 + + is represented by the base string URI: "https://www.example.net:8080/". + + .. _`section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2 + """ + if not isinstance(uri, unicode): + raise ValueError('uri must be a unicode object.') + + # FIXME: urlparse does not support unicode + scheme, netloc, path, params, query, fragment = urlparse.urlparse(uri) + + # The scheme, authority, and path of the request resource URI `RFC3986`_ + # are included by constructing an "http" or "https" URI representing + # the request resource (without the query or fragment) as follows: + # + # .. _`RFC3986`: http://tools.ietf.org/html/rfc3986 + + # 1. The scheme and host MUST be in lowercase. + scheme = scheme.lower() + netloc = netloc.lower() + + # 2. The host and port values MUST match the content of the HTTP + # request "Host" header field. + # TODO: enforce this constraint + + # 3. The port MUST be included if it is not the default port for the + # scheme, and MUST be excluded if it is the default. Specifically, + # the port MUST be excluded when making an HTTP request `RFC2616`_ + # to port 80 or when making an HTTPS request `RFC2818`_ to port 443. + # All other non-default port numbers MUST be included. + # + # .. _`RFC2616`: http://tools.ietf.org/html/rfc2616 + # .. _`RFC2818`: http://tools.ietf.org/html/rfc2818 + default_ports = ( + (u'http', u'80'), + (u'https', u'443'), + ) + if u':' in netloc: + host, port = netloc.split(u':', 1) + if (scheme, port) in default_ports: + netloc = host + + return urlparse.urlunparse((scheme, netloc, path, u'', u'', u'')) + + +# ** Request Parameters ** +# +# Per `section 3.4.1.3`_ of the spec. +# +# In order to guarantee a consistent and reproducible representation of +# the request parameters, the parameters are collected and decoded to +# their original decoded form. They are then sorted and encoded in a +# particular manner that is often different from their original +# encoding scheme, and concatenated into a single string. +# +# .. _`section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3 + +def collect_parameters(uri_query='', body=[], headers=None, + exclude_oauth_signature=True): + """**Parameter Sources** + + Parameters starting with `oauth_` will be unescaped. + + Body parameters must be supplied as a dict, a list of 2-tuples, or a + formencoded query string. + + Headers must be supplied as a dict. + + Per `section 3.4.1.3.1`_ of the spec. 
+ + For example, the HTTP request:: + + POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1 + Host: example.com + Content-Type: application/x-www-form-urlencoded + Authorization: OAuth realm="Example", + oauth_consumer_key="9djdj82h48djs9d2", + oauth_token="kkk9d7dh3k39sjv7", + oauth_signature_method="HMAC-SHA1", + oauth_timestamp="137131201", + oauth_nonce="7d8f3e4a", + oauth_signature="djosJKDKJSD8743243%2Fjdk33klY%3D" + + c2&a3=2+q + + contains the following (fully decoded) parameters used in the + signature base string:: + + +------------------------+------------------+ + | Name | Value | + +------------------------+------------------+ + | b5 | =%3D | + | a3 | a | + | c@ | | + | a2 | r b | + | oauth_consumer_key | 9djdj82h48djs9d2 | + | oauth_token | kkk9d7dh3k39sjv7 | + | oauth_signature_method | HMAC-SHA1 | + | oauth_timestamp | 137131201 | + | oauth_nonce | 7d8f3e4a | + | c2 | | + | a3 | 2 q | + +------------------------+------------------+ + + Note that the value of "b5" is "=%3D" and not "==". Both "c@" and + "c2" have empty values. While the encoding rules specified in this + specification for the purpose of constructing the signature base + string exclude the use of a "+" character (ASCII code 43) to + represent an encoded space character (ASCII code 32), this practice + is widely used in "application/x-www-form-urlencoded" encoded values, + and MUST be properly decoded, as demonstrated by one of the "a3" + parameter instances (the "a3" parameter is used twice in this + request). + + .. _`section 3.4.1.3.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.1 + """ + headers = headers or {} + params = [] + + # The parameters from the following sources are collected into a single + # list of name/value pairs: + + # * The query component of the HTTP request URI as defined by + # `RFC3986, Section 3.4`_. The query component is parsed into a list + # of name/value pairs by treating it as an + # "application/x-www-form-urlencoded" string, separating the names + # and values and decoding them as defined by + # `W3C.REC-html40-19980424`_, Section 17.13.4. + # + # .. _`RFC3986, Section 3.4`: http://tools.ietf.org/html/rfc3986#section-3.4 + # .. _`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424 + if uri_query: + params.extend(urlparse.parse_qsl(uri_query, keep_blank_values=True)) + + # * The OAuth HTTP "Authorization" header field (`Section 3.5.1`_) if + # present. The header's content is parsed into a list of name/value + # pairs excluding the "realm" parameter if present. The parameter + # values are decoded as defined by `Section 3.5.1`_. + # + # .. _`Section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1 + if headers: + headers_lower = dict((k.lower(), v) for k, v in headers.items()) + authorization_header = headers_lower.get(u'authorization') + if authorization_header is not None: + params.extend([i for i in utils.parse_authorization_header( + authorization_header) if i[0] != u'realm']) + + # * The HTTP request entity-body, but only if all of the following + # conditions are met: + # * The entity-body is single-part. + # + # * The entity-body follows the encoding requirements of the + # "application/x-www-form-urlencoded" content-type as defined by + # `W3C.REC-html40-19980424`_. + + # * The HTTP request entity-header includes the "Content-Type" + # header field set to "application/x-www-form-urlencoded". 
+ # + # .._`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424 + + # TODO: enforce header param inclusion conditions + bodyparams = extract_params(body) or [] + params.extend(bodyparams) + + # ensure all oauth params are unescaped + unescaped_params = [] + for k, v in params: + if k.startswith(u'oauth_'): + v = utils.unescape(v) + unescaped_params.append((k, v)) + + # The "oauth_signature" parameter MUST be excluded from the signature + # base string if present. + if exclude_oauth_signature: + unescaped_params = filter(lambda i: i[0] != u'oauth_signature', + unescaped_params) + + return unescaped_params + + +def normalize_parameters(params): + """**Parameters Normalization** + Per `section 3.4.1.3.2`_ of the spec. + + For example, the list of parameters from the previous section would + be normalized as follows: + + Encoded:: + + +------------------------+------------------+ + | Name | Value | + +------------------------+------------------+ + | b5 | %3D%253D | + | a3 | a | + | c%40 | | + | a2 | r%20b | + | oauth_consumer_key | 9djdj82h48djs9d2 | + | oauth_token | kkk9d7dh3k39sjv7 | + | oauth_signature_method | HMAC-SHA1 | + | oauth_timestamp | 137131201 | + | oauth_nonce | 7d8f3e4a | + | c2 | | + | a3 | 2%20q | + +------------------------+------------------+ + + Sorted:: + + +------------------------+------------------+ + | Name | Value | + +------------------------+------------------+ + | a2 | r%20b | + | a3 | 2%20q | + | a3 | a | + | b5 | %3D%253D | + | c%40 | | + | c2 | | + | oauth_consumer_key | 9djdj82h48djs9d2 | + | oauth_nonce | 7d8f3e4a | + | oauth_signature_method | HMAC-SHA1 | + | oauth_timestamp | 137131201 | + | oauth_token | kkk9d7dh3k39sjv7 | + +------------------------+------------------+ + + Concatenated Pairs:: + + +-------------------------------------+ + | Name=Value | + +-------------------------------------+ + | a2=r%20b | + | a3=2%20q | + | a3=a | + | b5=%3D%253D | + | c%40= | + | c2= | + | oauth_consumer_key=9djdj82h48djs9d2 | + | oauth_nonce=7d8f3e4a | + | oauth_signature_method=HMAC-SHA1 | + | oauth_timestamp=137131201 | + | oauth_token=kkk9d7dh3k39sjv7 | + +-------------------------------------+ + + and concatenated together into a single string (line breaks are for + display purposes only):: + + a2=r%20b&a3=2%20q&a3=a&b5=%3D%253D&c%40=&c2=&oauth_consumer_key=9dj + dj82h48djs9d2&oauth_nonce=7d8f3e4a&oauth_signature_method=HMAC-SHA1 + &oauth_timestamp=137131201&oauth_token=kkk9d7dh3k39sjv7 + + .. _`section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2 + """ + + # The parameters collected in `Section 3.4.1.3`_ are normalized into a + # single string as follows: + # + # .. _`Section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3 + + # 1. First, the name and value of each parameter are encoded + # (`Section 3.6`_). + # + # .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + key_values = [(utils.escape(k), utils.escape(v)) for k, v in params] + + # 2. The parameters are sorted by name, using ascending byte value + # ordering. If two or more parameters share the same name, they + # are sorted by their value. + key_values.sort() + + # 3. The name of each parameter is concatenated to its corresponding + # value using an "=" character (ASCII code 61) as a separator, even + # if the value is empty. + parameter_parts = [u'{0}={1}'.format(k, v) for k, v in key_values] + + # 4. 
The sorted name/value pairs are concatenated together into a + # single string by using an "&" character (ASCII code 38) as + # separator. + return u'&'.join(parameter_parts) + + +def sign_hmac_sha1(base_string, client_secret, resource_owner_secret): + """**HMAC-SHA1** + + The "HMAC-SHA1" signature method uses the HMAC-SHA1 signature + algorithm as defined in `RFC2104`_:: + + digest = HMAC-SHA1 (key, text) + + Per `section 3.4.2`_ of the spec. + + .. _`RFC2104`: http://tools.ietf.org/html/rfc2104 + .. _`section 3.4.2`: http://tools.ietf.org/html/rfc5849#section-3.4.2 + """ + + # The HMAC-SHA1 function variables are used in following way: + + # text is set to the value of the signature base string from + # `Section 3.4.1.1`_. + # + # .. _`Section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1 + text = base_string + + # key is set to the concatenated values of: + # 1. The client shared-secret, after being encoded (`Section 3.6`_). + # + # .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + key = utils.escape(client_secret or u'') + + # 2. An "&" character (ASCII code 38), which MUST be included + # even when either secret is empty. + key += u'&' + + # 3. The token shared-secret, after being encoded (`Section 3.6`_). + # + # .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + key += utils.escape(resource_owner_secret or u'') + + # FIXME: HMAC does not support unicode! + key_utf8 = key.encode('utf-8') + text_utf8 = text.encode('utf-8') + signature = hmac.new(key_utf8, text_utf8, hashlib.sha1) + + # digest is used to set the value of the "oauth_signature" protocol + # parameter, after the result octet string is base64-encoded + # per `RFC2045, Section 6.8`. + # + # .. _`RFC2045, Section 6.8`: http://tools.ietf.org/html/rfc2045#section-6.8 + return binascii.b2a_base64(signature.digest())[:-1].decode('utf-8') + + +def sign_rsa_sha1(base_string, rsa_private_key): + """**RSA-SHA1** + + Per `section 3.4.3`_ of the spec. + + The "RSA-SHA1" signature method uses the RSASSA-PKCS1-v1_5 signature + algorithm as defined in `RFC3447, Section 8.2`_ (also known as + PKCS#1), using SHA-1 as the hash function for EMSA-PKCS1-v1_5. To + use this method, the client MUST have established client credentials + with the server that included its RSA public key (in a manner that is + beyond the scope of this specification). + + NOTE: this method requires the python-rsa library. + + .. _`section 3.4.3`: http://tools.ietf.org/html/rfc5849#section-3.4.3 + .. _`RFC3447, Section 8.2`: http://tools.ietf.org/html/rfc3447#section-8.2 + + """ + + # TODO: finish RSA documentation + + import rsa + key = rsa.PrivateKey.load_pkcs1(rsa_private_key) + sig = rsa.sign(base_string, key, 'SHA-1') + return binascii.b2a_base64(sig)[:-1] + + +def sign_plaintext(client_secret, resource_owner_secret): + """Sign a request using plaintext. + + Per `section 3.4.4`_ of the spec. + + The "PLAINTEXT" method does not employ a signature algorithm. It + MUST be used with a transport-layer mechanism such as TLS or SSL (or + sent over a secure channel with equivalent protections). It does not + utilize the signature base string or the "oauth_timestamp" and + "oauth_nonce" parameters. + + .. _`section 3.4.4`: http://tools.ietf.org/html/rfc5849#section-3.4.4 + + """ + + # The "oauth_signature" protocol parameter is set to the concatenated + # value of: + + # 1. The client shared-secret, after being encoded (`Section 3.6`_). + # + # .. 
_`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + signature = utils.escape(client_secret or u'') + + # 2. An "&" character (ASCII code 38), which MUST be included even + # when either secret is empty. + signature += u'&' + + # 3. The token shared-secret, after being encoded (`Section 3.6`_). + # + # .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + signature += utils.escape(resource_owner_secret or u'') + + return signature + diff --git a/libs/oauthlib/oauth1/rfc5849/utils.py b/libs/oauthlib/oauth1/rfc5849/utils.py new file mode 100644 index 0000000..6db446f --- /dev/null +++ b/libs/oauthlib/oauth1/rfc5849/utils.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- + +""" +oauthlib.utils +~~~~~~~~~~~~~~ + +This module contains utility methods used by various parts of the OAuth +spec. +""" + +import string +import time +import urllib2 +from random import getrandbits, choice + +from oauthlib.common import quote, unquote + +UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') + + string.digits.decode('ascii')) + + +def filter_params(target): + """Decorator which filters params to remove non-oauth_* parameters + + Assumes the decorated method takes a params dict or list of tuples as its + first argument. + """ + def wrapper(params, *args, **kwargs): + params = filter_oauth_params(params) + return target(params, *args, **kwargs) + + wrapper.__doc__ = target.__doc__ + return wrapper + + +def filter_oauth_params(params): + """Removes all non oauth parameters from a dict or a list of params.""" + is_oauth = lambda kv: kv[0].startswith(u"oauth_") + if isinstance(params, dict): + return filter(is_oauth, params.items()) + else: + return filter(is_oauth, params) + + +def generate_timestamp(): + """Get seconds since epoch (UTC). + + Per `section 3.3`_ of the spec. + + .. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3 + """ + return unicode(int(time.time())) + + +def generate_nonce(): + """Generate pseudorandom nonce that is unlikely to repeat. + + Per `section 3.3`_ of the spec. + + A random 64-bit number is appended to the epoch timestamp for both + randomness and to decrease the likelihood of collisions. + + .. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3 + """ + return unicode(getrandbits(64)) + generate_timestamp() + + +def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET): + """Generates a generic OAuth token + + According to `section 2`_ of the spec, the method of token + construction is undefined. This implementation is simply a random selection + of `length` choices from `chars`. + + Credit to Ignacio Vazquez-Abrams for his excellent `Stackoverflow answer`_ + + .. _`Stackoverflow answer` : http://stackoverflow.com/questions/2257441/ + python-random-string-generation-with-upper-case-letters-and-digits + + """ + return u''.join(choice(chars) for x in range(length)) + + +def escape(u): + """Escape a unicode string in an OAuth-compatible fashion. + + Per `section 3.6`_ of the spec. + + .. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + + """ + if not isinstance(u, unicode): + raise ValueError('Only unicode objects are escapable.') + # Letters, digits, and the characters '_.-' are already treated as safe + # by urllib.quote(). We need to add '~' to fully support rfc5849. 
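+ # e.g. escape(u'a b~') == u'a%20b~': the space is percent-encoded while + # '~' is passed through unchanged (an illustrative note, per the safe + # list in the return below)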
+ return quote(u, safe='~') + + +def unescape(u): + if not isinstance(u, unicode): + raise ValueError('Only unicode objects are unescapable.') + return unquote(u) + + +def urlencode(query): + """Encode a sequence of two-element tuples or dictionary into a URL query string. + + Operates using an OAuth-safe escape() method, in contrast to urllib.urlencode. + """ + # Convert dictionaries to list of tuples + if isinstance(query, dict): + query = query.items() + return u"&".join([u'='.join([escape(k), escape(v)]) for k, v in query]) + + +def parse_keqv_list(l): + """A unicode-safe version of urllib2.parse_keqv_list""" + encoded_list = [u.encode('utf-8') for u in l] + encoded_parsed = urllib2.parse_keqv_list(encoded_list) + return dict((k.decode('utf-8'), + v.decode('utf-8')) for k, v in encoded_parsed.items()) + + +def parse_http_list(u): + """A unicode-safe version of urllib2.parse_http_list""" + encoded_str = u.encode('utf-8') + encoded_list = urllib2.parse_http_list(encoded_str) + return [s.decode('utf-8') for s in encoded_list] + + +def parse_authorization_header(authorization_header): + """Parse an OAuth authorization header into a list of 2-tuples""" + auth_scheme = u'OAuth ' + if authorization_header.startswith(auth_scheme): + authorization_header = authorization_header.replace(auth_scheme, u'', 1) + items = parse_http_list(authorization_header) + try: + return parse_keqv_list(items).items() + except ValueError: + raise ValueError('Malformed authorization header') + diff --git a/libs/oauthlib/oauth2/__init__.py b/libs/oauthlib/oauth2/__init__.py new file mode 100644 index 0000000..0e8933c --- /dev/null +++ b/libs/oauthlib/oauth2/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +""" +oauthlib.oauth2 +~~~~~~~~~~~~~~ + +This module is a wrapper for the most recent implementation of OAuth 2.0 Client +and Server classes. +""" + +from .draft25 import Client, Server + diff --git a/libs/oauthlib/oauth2/draft25/__init__.py b/libs/oauthlib/oauth2/draft25/__init__.py new file mode 100644 index 0000000..3e50a18 --- /dev/null +++ b/libs/oauthlib/oauth2/draft25/__init__.py @@ -0,0 +1,14 @@ +""" +oauthlib.oauth2.draft25 +~~~~~~~~~~~~~~ + +This module is an implementation of various logic needed +for signing and checking OAuth 2.0 draft 25 requests. +""" + +class Client(object): + pass + +class Server(object): + pass + diff --git a/libs/oauthlib/oauth2/draft25/tokens.py b/libs/oauthlib/oauth2/draft25/tokens.py new file mode 100644 index 0000000..9b5f586 --- /dev/null +++ b/libs/oauthlib/oauth2/draft25/tokens.py @@ -0,0 +1,131 @@ +from __future__ import absolute_import +""" +oauthlib.oauth2.draft25.tokens +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This module contains methods for adding two types of access tokens to requests. + +- Bearer http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18 +- MAC http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-00 + +""" +from binascii import b2a_base64 +import hashlib +import hmac +from urlparse import urlparse + +from . import utils + + +def prepare_mac_header(token, uri, key, http_method, nonce=None, headers=None, + body=None, ext=u'', hash_algorithm=u'hmac-sha-1'): + """Add a `MAC Access Authentication`_ signature to headers. + + Unlike OAuth 1, this HMAC signature does not require inclusion of the request + payload/body, nor does it use a combination of client_secret and + token_secret but rather a mac_key provided together with the access token. 
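+ + The MAC signature is computed over a normalized request string: the nonce, + the upper-cased HTTP method, the request URI, the host, the port, the body + hash and any extension data, each on its own line (an informal summary of + the code below).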
+ + Currently two algorithms are supported, "hmac-sha-1" and "hmac-sha-256"; + `extension algorithms`_ are not supported. + + Example MAC Authorization header, linebreaks added for clarity: + + Authorization: MAC id="h480djs93hd8", + nonce="1336363200:dj83hs9s", + mac="bhCQXTVyfj5cmA9uKkPFx1zeOXM=" + + .. _`MAC Access Authentication`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01 + .. _`extension algorithms`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-7.1 + + :param uri: Request URI. + :param headers: Request headers as a dictionary. + :param http_method: HTTP Request method. + :param key: MAC key provided by the token endpoint. + :param hash_algorithm: HMAC algorithm provided by the token endpoint. + :return: headers dictionary with the authorization field added. + """ + http_method = http_method.upper() + host, port = utils.host_from_uri(uri) + + if hash_algorithm.lower() == u'hmac-sha-1': + h = hashlib.sha1 + else: + h = hashlib.sha256 + + nonce = nonce or u'{0}:{1}'.format(utils.generate_nonce(), utils.generate_timestamp()) + sch, net, path, par, query, fra = urlparse(uri) + + if query: + request_uri = path + u'?' + query + else: + request_uri = path + + # Hash the body/payload + if body is not None: + bodyhash = b2a_base64(h(body).digest())[:-1].decode('utf-8') + else: + bodyhash = u'' + + # Create the normalized base string + base = [] + base.append(nonce) + base.append(http_method.upper()) + base.append(request_uri) + base.append(host) + base.append(port) + base.append(bodyhash) + base.append(ext) + base_string = '\n'.join(base) + u'\n' + + # hmac struggles with unicode strings - http://bugs.python.org/issue5285 + if isinstance(key, unicode): + key = key.encode('utf-8') + sign = hmac.new(key, base_string, h) + sign = b2a_base64(sign.digest())[:-1].decode('utf-8') + + header = [] + header.append(u'MAC id="%s"' % token) + header.append(u'nonce="%s"' % nonce) + if bodyhash: + header.append(u'bodyhash="%s"' % bodyhash) + if ext: + header.append(u'ext="%s"' % ext) + header.append(u'mac="%s"' % sign) + + headers = headers or {} + headers[u'Authorization'] = u', '.join(header) + return headers + + +def prepare_bearer_uri(token, uri): + """Add a `Bearer Token`_ to the request URI. + Not recommended, use only if client can't use authorization header or body. + + http://www.example.com/path?access_token=h480djs93hd8 + + .. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18 + """ + return utils.add_params_to_uri(uri, [(u'access_token', token)]) + + +def prepare_bearer_headers(token, headers=None): + """Add a `Bearer Token`_ to the request headers. + Recommended method of passing bearer tokens. + + Authorization: Bearer h480djs93hd8 + + .. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18 + """ + headers = headers or {} + headers[u'Authorization'] = u'Bearer %s' % token + return headers + + +def prepare_bearer_body(token, body=u''): + """Add a `Bearer Token`_ to the request body. + + access_token=h480djs93hd8 + + .. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18 + """ + return utils.add_params_to_qs(body, [(u'access_token', token)]) diff --git a/libs/oauthlib/oauth2/draft25/utils.py b/libs/oauthlib/oauth2/draft25/utils.py new file mode 100644 index 0000000..48b4ea1 --- /dev/null +++ b/libs/oauthlib/oauth2/draft25/utils.py @@ -0,0 +1,128 @@ +""" +oauthlib.utils +~~~~~~~~~~~~~~ + +This module contains utility methods used by various parts of the OAuth 2 spec. 
+""" + +import random +import string +import time +import urllib +from urlparse import urlparse, urlunparse, parse_qsl + +UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') + + string.digits.decode('ascii')) + +def add_params_to_qs(query, params): + """Extend a query with a list of two-tuples. + + :param query: Query string. + :param params: List of two-tuples. + :return: extended query + """ + queryparams = parse_qsl(query, keep_blank_values=True) + queryparams.extend(params) + return urlencode(queryparams) + + +def add_params_to_uri(uri, params): + """Add a list of two-tuples to the uri query components. + + :param uri: Full URI. + :param params: List of two-tuples. + :return: uri with extended query + """ + sch, net, path, par, query, fra = urlparse(uri) + query = add_params_to_qs(query, params) + return urlunparse((sch, net, path, par, query, fra)) + + +def escape(u): + """Escape a string in an OAuth-compatible fashion. + + Per `section 3.6`_ of the spec. + + .. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6 + + """ + if not isinstance(u, unicode): + raise ValueError('Only unicode objects are escapable.') + return urllib.quote(u.encode('utf-8'), safe='~') + + +def generate_nonce(): + """Generate pseudorandom nonce that is unlikely to repeat. + + Per `section 3.2.1`_ of the MAC Access Authentication spec. + + A random 64-bit number is appended to the epoch timestamp for both + randomness and to decrease the likelihood of collisions. + + .. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1 + """ + return unicode(unicode(random.getrandbits(64)) + generate_timestamp()) + + +def generate_timestamp(): + """Get seconds since epoch (UTC). + + Per `section 3.2.1`_ of the MAC Access Authentication spec. + + .. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1 + """ + return unicode(int(time.time())) + + +def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET): + """Generates a generic OAuth 2 token + + According to `section 1.4`_ and `section 1.5` of the spec, the method of token + construction is undefined. This implementation is simply a random selection + of `length` choices from `chars`. SystemRandom is used since it provides + higher entropy than random.choice. + + .. _`section 1.4`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.4 + .. _`section 1.5`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.5 + """ + rand = random.SystemRandom() + return u''.join(rand.choice(chars) for x in range(length)) + + +def host_from_uri(uri): + """Extract hostname and port from URI. + + Will use default port for HTTP and HTTPS if none is present in the URI. + + >>> host_from_uri(u'https://www.example.com/path?query') + u'www.example.com', u'443' + >>> host_from_uri(u'http://www.example.com:8080/path?query') + u'www.example.com', u'8080' + + :param uri: Full URI. + :param http_method: HTTP request method. + :return: hostname, port + """ + default_ports = { + u'HTTP' : u'80', + u'HTTPS' : u'443', + } + + sch, netloc, path, par, query, fra = urlparse(uri) + if u':' in netloc: + netloc, port = netloc.split(u':', 1) + else: + port = default_ports.get(sch.upper()) + + return netloc, port + + +def urlencode(query): + """Encode a sequence of two-element tuples or dictionary into a URL query string. + + Operates using an OAuth-safe escape() method, in contrast to urllib.urlenocde. 
+ """ + # Convert dictionaries to list of tuples + if isinstance(query, dict): + query = query.items() + return "&".join(['='.join([escape(k), escape(v)]) for k, v in query]) diff --git a/libs/subliminal/api.py b/libs/subliminal/api.py index baff5f1..a7baeab 100755 --- a/libs/subliminal/api.py +++ b/libs/subliminal/api.py @@ -18,7 +18,8 @@ from .core import (SERVICES, LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE, create_list_tasks, consume_task, create_download_tasks, group_by_video, key_subtitles) -from .languages import list_languages +import guessit +from guessit.language import ALL_LANGUAGES import logging @@ -26,7 +27,7 @@ __all__ = ['list_subtitles', 'download_subtitles'] logger = logging.getLogger(__name__) -def list_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3): +def list_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None): """List subtitles in given paths according to the criteria :param paths: path(s) to video file or folder @@ -37,19 +38,20 @@ def list_subtitles(paths, languages=None, services=None, force=True, multi=False :param bool multi: search multiple languages for the same video :param string cache_dir: path to the cache directory to use :param int max_depth: maximum depth for scanning entries + :param function scan_filter: filter function that takes a path as argument and returns a boolean indicating whether it has to be filtered out (``True``) or not (``False``) :return: found subtitles :rtype: dict of :class:`~subliminal.videos.Video` => [:class:`~subliminal.subtitles.ResultSubtitle`] """ services = services or SERVICES - languages = set(languages or list_languages(1)) + languages = set(map(guessit.Language, languages or []) or ALL_LANGUAGES) if isinstance(paths, basestring): paths = [paths] if any([not isinstance(p, unicode) for p in paths]): logger.warning(u'Not all entries are unicode') results = [] service_instances = {} - tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth) + tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth, scan_filter) for task in tasks: try: result = consume_task(task, service_instances) @@ -61,7 +63,7 @@ def list_subtitles(paths, languages=None, services=None, force=True, multi=False return group_by_video(results) -def download_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, order=None): +def download_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None, order=None): """Download subtitles in given paths according to the criteria :param paths: path(s) to video file or folder @@ -72,6 +74,7 @@ def download_subtitles(paths, languages=None, services=None, force=True, multi=F :param bool multi: search multiple languages for the same video :param string cache_dir: path to the cache directory to use :param int max_depth: maximum depth for scanning entries + :param function scan_filter: filter function that takes a path as argument and returns a boolean indicating whether it has to be filtered out (``True``) or not (``False``) :param order: preferred order for subtitles sorting :type list: list of :data:`~subliminal.core.LANGUAGE_INDEX`, :data:`~subliminal.core.SERVICE_INDEX`, :data:`~subliminal.core.SERVICE_CONFIDENCE`, :data:`~subliminal.core.MATCHING_CONFIDENCE` :return: found subtitles @@ -79,11 +82,11 @@ 
def download_subtitles(paths, languages=None, services=None, force=True, multi=F """ services = services or SERVICES - languages = languages or list_languages(1) + languages = map(guessit.Language, languages or []) or list(ALL_LANGUAGES) if isinstance(paths, basestring): paths = [paths] order = order or [LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE] - subtitles_by_video = list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth) + subtitles_by_video = list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth, scan_filter) for video, subtitles in subtitles_by_video.iteritems(): subtitles.sort(key=lambda s: key_subtitles(s, video, languages, services, order), reverse=True) results = [] diff --git a/libs/subliminal/async.py b/libs/subliminal/async.py index ce18a27..e125bbf 100755 --- a/libs/subliminal/async.py +++ b/libs/subliminal/async.py @@ -18,7 +18,7 @@ from .core import (consume_task, LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE, SERVICES, create_list_tasks, create_download_tasks, group_by_video, key_subtitles) -from .languages import list_languages +from guessit.language import ALL_LANGUAGES from .tasks import StopTask import Queue import logging @@ -108,29 +108,29 @@ class Pool(object): break return results - def list_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3): + def list_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None): """See :meth:`subliminal.list_subtitles`""" services = services or SERVICES - languages = set(languages or list_languages(1)) + languages = set(languages or ALL_LANGUAGES) if isinstance(paths, basestring): paths = [paths] if any([not isinstance(p, unicode) for p in paths]): logger.warning(u'Not all entries are unicode') - tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth) + tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth, scan_filter) for task in tasks: self.tasks.put(task) self.join() results = self.collect() return group_by_video(results) - def download_subtitles(self, paths, languages=None, services=None, cache_dir=None, max_depth=3, force=True, multi=False, order=None): + def download_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None, order=None): """See :meth:`subliminal.download_subtitles`""" services = services or SERVICES - languages = languages or list_languages(1) + languages = languages or list(ALL_LANGUAGES) if isinstance(paths, basestring): paths = [paths] order = order or [LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE] - subtitles_by_video = self.list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth) + subtitles_by_video = self.list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth, scan_filter) for video, subtitles in subtitles_by_video.iteritems(): subtitles.sort(key=lambda s: key_subtitles(s, video, languages, services, order), reverse=True) tasks = create_download_tasks(subtitles_by_video, multi) diff --git a/libs/subliminal/cache.py b/libs/subliminal/cache.py new file mode 100755 index 0000000..f4d7d94 --- /dev/null +++ b/libs/subliminal/cache.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +# Copyright 2012 Nicolas Wack +# +# This file is part of subliminal. 
+# +# subliminal is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# subliminal is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with subliminal. If not, see . +import os.path +from collections import defaultdict +import threading +from functools import wraps +import logging +try: + import cPickle as pickle +except ImportError: + import pickle + + +logger = logging.getLogger(__name__) + + +class Cache(object): + """A Cache object contains cached values for methods. It can have + separate internal caches, one for each service. + """ + + def __init__(self, cache_dir): + self.cache_dir = cache_dir + self.cache = defaultdict(dict) + self.lock = threading.RLock() + + def __del__(self): + for service_name in self.cache: + self.save(service_name) + + def cache_location(self, service_name): + return os.path.join(self.cache_dir, 'subliminal_%s.cache' % service_name) + + def load(self, service_name): + with self.lock: + if service_name in self.cache: + # already loaded + return + + self.cache[service_name] = defaultdict(dict) + filename = self.cache_location(service_name) + logger.debug(u'Cache: loading cache from %s' % filename) + try: + self.cache[service_name] = pickle.load(open(filename, 'rb')) + except IOError: + logger.info('Cache: Cache file "%s" doesn\'t exist, creating it' % filename) + except EOFError: + logger.error('Cache: cache file "%s" is corrupted... Removing it.' % filename) + os.remove(filename) + + def save(self, service_name): + filename = self.cache_location(service_name) + logger.debug(u'Cache: saving cache to %s' % filename) + with self.lock: + pickle.dump(self.cache[service_name], open(filename, 'wb')) + + def clear(self, service_name): + try: + os.remove(self.cache_location(service_name)) + except OSError: + pass + self.cache[service_name] = defaultdict(dict) + + def cached_func_key(self, func, cls=None): + try: + cls = func.im_class + except: + pass + return ('%s.%s' % (cls.__module__, cls.__name__), func.__name__) + + def function_cache(self, service_name, func): + func_key = self.cached_func_key(func) + return self.cache[service_name][func_key] + + def cache_for(self, service_name, func, args, result): + # no need to lock here, dict ops are atomic + self.function_cache(service_name, func)[args] = result + + def cached_value(self, service_name, func, args): + """Raises KeyError if not found""" + # no need to lock here, dict ops are atomic + return self.function_cache(service_name, func)[args] + + +def cachedmethod(function): + """Decorator to make a method use the cache. 
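+ + Cached values are stored per service class and keyed on the method's + positional arguments, excluding the instance itself.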
+ + WARNING: this can NOT be used with static functions, it has to be used on + methods of some class.""" + + @wraps(function) + def cached(*args): + c = args[0].config.cache + service_name = args[0].__class__.__name__ + func_key = c.cached_func_key(function, cls=args[0].__class__) + func_cache = c.cache[service_name][func_key] + + # we need to remove the first element of args for the key, as it is the + # instance pointer and we don't want the cache to know which instance + # called it, it is shared among all instances of the same class + key = args[1:] + + if key in func_cache: + result = func_cache[key] + logger.debug(u'Using cached value for %s(%s), returns: %s' % (func_key, key, result)) + return result + + result = function(*args) + + # note: another thread could have already cached a value in the + # meantime, but that's ok as we prefer to keep the latest value in + # the cache + func_cache[key] = result + + return result + + return cached diff --git a/libs/subliminal/core.py b/libs/subliminal/core.py index 56f4347..9bbc1cd 100755 --- a/libs/subliminal/core.py +++ b/libs/subliminal/core.py @@ -20,6 +20,8 @@ from .services import ServiceConfig from .tasks import DownloadTask, ListTask from .utils import get_keywords from .videos import Episode, Movie, scan +from guessit.language import lang_set +import bs4 from collections import defaultdict from itertools import groupby import guessit @@ -30,11 +32,11 @@ __all__ = ['SERVICES', 'LANGUAGE_INDEX', 'SERVICE_INDEX', 'SERVICE_CONFIDENCE', 'create_list_tasks', 'create_download_tasks', 'consume_task', 'matching_confidence', 'key_subtitles', 'group_by_video'] logger = logging.getLogger(__name__) -SERVICES = ['opensubtitles', 'bierdopje', 'subswiki', 'subtitulos', 'thesubdb'] +SERVICES = ['opensubtitles', 'bierdopje', 'subswiki', 'subtitulos', 'thesubdb', 'addic7ed', 'tvsubtitles'] LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE = range(4) -def create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth): +def create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth, scan_filter): """Create a list of :class:`~subliminal.tasks.ListTask` from one or more paths using the given criteria :param paths: path(s) to video file or folder @@ -45,18 +47,20 @@ def create_list_tasks(paths, languages, services, force, multi, cache_dir, max_d :param bool multi: search multiple languages for the same video :param string cache_dir: path to the cache directory to use :param int max_depth: maximum depth for scanning entries + :param function scan_filter: filter function that takes a path as argument and returns a boolean indicating whether it has to be filtered out (``True``) or not (``False``) :return: the created tasks :rtype: list of :class:`~subliminal.tasks.ListTask` """ scan_result = [] for p in paths: - scan_result.extend(scan(p, max_depth)) + scan_result.extend(scan(p, max_depth, scan_filter)) logger.debug(u'Found %d videos in %r with maximum depth %d' % (len(scan_result), paths, max_depth)) tasks = [] config = ServiceConfig(multi, cache_dir) + services = filter_services(services) for video, detected_subtitles in scan_result: - detected_languages = set([s.language for s in detected_subtitles]) + detected_languages = set(s.language for s in detected_subtitles) wanted_languages = languages.copy() if not force and multi: wanted_languages -= detected_languages @@ -70,14 +74,9 @@ def create_list_tasks(paths, languages, services, force, multi, cache_dir, max_d for service_name in services: mod 
= __import__('services.' + service_name, globals=globals(), locals=locals(), fromlist=['Service'], level=-1) service = mod.Service - service_languages = wanted_languages & service.available_languages() - if not service_languages: - logger.debug(u'Skipping %r: none of wanted languages %r available for service %s' % (video, wanted_languages, service_name)) + if not service.check_validity(video, wanted_languages): continue - if not service.is_valid_video(video): - logger.debug(u'Skipping %r: not part of supported videos %r for service %s' % (video, service.videos, service_name)) - continue - task = ListTask(video, service_languages, service_name, config) + task = ListTask(video, wanted_languages & service.languages, service_name, config) logger.debug(u'Created task %r' % task) tasks.append(task) return tasks @@ -128,25 +127,19 @@ def consume_task(task, services=None): logger.info(u'Consuming %r' % task) result = None if isinstance(task, ListTask): - if task.service not in services: - mod = __import__('services.' + task.service, globals=globals(), locals=locals(), fromlist=['Service'], level=-1) - services[task.service] = mod.Service(task.config) - services[task.service].init() - subtitles = services[task.service].list(task.video, task.languages) - result = subtitles + service = get_service(services, task.service, config=task.config) + result = service.list(task.video, task.languages) elif isinstance(task, DownloadTask): for subtitle in task.subtitles: - if subtitle.service not in services: - mod = __import__('services.' + subtitle.service, globals=globals(), locals=locals(), fromlist=['Service'], level=-1) - services[subtitle.service] = mod.Service() - services[subtitle.service].init() + service = get_service(services, subtitle.service) try: - services[subtitle.service].download(subtitle) + service.download(subtitle) result = subtitle break except DownloadFailedError: logger.warning(u'Could not download subtitle %r, trying next' % subtitle) continue + if result is None: logger.error(u'No subtitles could be downloaded for video %r' % task.video) return result @@ -193,6 +186,26 @@ def matching_confidence(video, subtitle): return confidence +def get_service(services, service_name, config=None): + """Get a service from its name in the service dict with the specified config. + If the service does not exist in the service dict, it is created and added to the dict. + + :param dict services: dict where to get existing services or put created ones + :param string service_name: name of the service to get + :param config: config to use for the service + :type config: :class:`~subliminal.services.ServiceConfig` or None + :return: the corresponding service + :rtype: :class:`~subliminal.services.ServiceBase` + + """ + if service_name not in services: + mod = __import__('services.' 
+ service_name, globals=globals(), locals=locals(), fromlist=['Service'], level=-1) + services[service_name] = mod.Service() + services[service_name].init() + services[service_name].config = config + return services[service_name] + + def key_subtitles(subtitle, video, languages, services, order): """Create a key to sort subtitle using the given order @@ -238,3 +251,21 @@ def group_by_video(list_results): for video, subtitles in list_results: result[video] += subtitles return result + + +def filter_services(services): + """Filter out services that are not available because of a missing feature + + :param list services: service names to filter + :return: a copy of the initial list of service names without unavailable ones + :rtype: list + + """ + filtered_services = services[:] + for service_name in services: + mod = __import__('services.' + service_name, globals=globals(), locals=locals(), fromlist=['Service'], level=-1) + service = mod.Service + if service.required_features is not None and bs4.builder_registry.lookup(*service.required_features) is None: + logger.warning(u'Service %s not available: none of available features could be used. One of %r required' % (service_name, service.required_features)) + filtered_services.remove(service_name) + return filtered_services diff --git a/libs/subliminal/infos.py b/libs/subliminal/infos.py index b28fda0..9958234 100755 --- a/libs/subliminal/infos.py +++ b/libs/subliminal/infos.py @@ -15,4 +15,4 @@ # # You should have received a copy of the GNU Lesser General Public License # along with subliminal. If not, see . -__version__ = '0.5.1' +__version__ = '0.6.0' diff --git a/libs/subliminal/languages.py b/libs/subliminal/languages.py deleted file mode 100755 index f743953..0000000 --- a/libs/subliminal/languages.py +++ /dev/null @@ -1,547 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2011-2012 Antoine Bertin -# -# This file is part of subliminal. -# -# subliminal is free software; you can redistribute it and/or modify it under -# the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# subliminal is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public License -# along with subliminal. If not, see . 
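The get_service() helper above amounts to a small lazily populated registry: a service is imported, instantiated and init()-ed only the first time a task needs it, and the same instance is reused for every later task. A minimal standalone sketch of that pattern follows; FakeService is a hypothetical stand-in for the dynamically imported mod.Service (the real code resolves it via __import__ as shown above), and this is a simplified illustration, not the patch's exact semantics.

    # Illustrative sketch of the lazy service registry; not subliminal's code.
    class FakeService(object):
        """Hypothetical stand-in for a service module's Service class."""
        def __init__(self, config=None):
            self.config = config
            self.initialized = False

        def init(self):
            # the real services open an HTTP session here
            self.initialized = True

    def get_service(services, service_name, config=None):
        # create, initialize and register the service on first use only
        if service_name not in services:
            services[service_name] = FakeService()
            services[service_name].init()
            services[service_name].config = config
        return services[service_name]

    registry = {}
    a = get_service(registry, 'addic7ed')
    b = get_service(registry, 'addic7ed')
    assert a is b and a.initialized  # instantiated once, then shared
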
-__all__ = ['convert_language', 'list_languages', 'LANGUAGES'] - - -def convert_language(language, to_iso, from_iso=None): - """Convert a language into another format - - :param string language: language - :param int to_iso: convert language to ISO-639-x - :param int from_iso: convert language from ISO-639-x - :return: converted language - :rtype: string - - """ - if from_iso == None: # if no from_iso is given, try to guess it - if language.startswith(language[:1].upper()): - from_iso = 0 - elif len(language) == 2: - from_iso = 1 - elif len(language) == 3: - from_iso = 2 - else: - raise ValueError('Invalid input language format') - if isinstance(language, unicode): - language = language.encode('utf-8') - converted_language = None - for language_tuple in LANGUAGES: - if language_tuple[from_iso] == language and language_tuple[to_iso]: - converted_language = language_tuple[to_iso] - break - return converted_language - - -def list_languages(iso): - """List languages in the given ISO-639-x format - - :param int iso: ISO-639-x format to list - :return: languages in the requested format - :rtype: list - - """ - return [l[iso] for l in LANGUAGES if l[iso]] - -#: ISO-639-2 languages list from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt -#: + ('Brazilian', 'po', 'pob') -LANGUAGES = [('Afar', 'aa', 'aar'), - ('Abkhazian', 'ab', 'abk'), - ('Achinese', '', 'ace'), - ('Acoli', '', 'ach'), - ('Adangme', '', 'ada'), - ('Adyghe; Adygei', '', 'ady'), - ('Afro-Asiatic languages', '', 'afa'), - ('Afrihili', '', 'afh'), - ('Afrikaans', 'af', 'afr'), - ('Ainu', '', 'ain'), - ('Akan', 'ak', 'aka'), - ('Akkadian', '', 'akk'), - ('Albanian', 'sq', 'alb'), - ('Aleut', '', 'ale'), - ('Algonquian languages', '', 'alg'), - ('Southern Altai', '', 'alt'), - ('Amharic', 'am', 'amh'), - ('English, Old (ca.450-1100)', '', 'ang'), - ('Angika', '', 'anp'), - ('Apache languages', '', 'apa'), - ('Arabic', 'ar', 'ara'), - ('Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)', '', 'arc'), - ('Aragonese', 'an', 'arg'), - ('Armenian', 'hy', 'arm'), - ('Mapudungun; Mapuche', '', 'arn'), - ('Arapaho', '', 'arp'), - ('Artificial languages', '', 'art'), - ('Arawak', '', 'arw'), - ('Assamese', 'as', 'asm'), - ('Asturian; Bable; Leonese; Asturleonese', '', 'ast'), - ('Athapascan languages', '', 'ath'), - ('Australian languages', '', 'aus'), - ('Avaric', 'av', 'ava'), - ('Avestan', 'ae', 'ave'), - ('Awadhi', '', 'awa'), - ('Aymara', 'ay', 'aym'), - ('Azerbaijani', 'az', 'aze'), - ('Banda languages', '', 'bad'), - ('Bamileke languages', '', 'bai'), - ('Bashkir', 'ba', 'bak'), - ('Baluchi', '', 'bal'), - ('Bambara', 'bm', 'bam'), - ('Balinese', '', 'ban'), - ('Basque', 'eu', 'baq'), - ('Basa', '', 'bas'), - ('Baltic languages', '', 'bat'), - ('Beja; Bedawiyet', '', 'bej'), - ('Belarusian', 'be', 'bel'), - ('Bemba', '', 'bem'), - ('Bengali', 'bn', 'ben'), - ('Berber languages', '', 'ber'), - ('Bhojpuri', '', 'bho'), - ('Bihari languages', 'bh', 'bih'), - ('Bikol', '', 'bik'), - ('Bini; Edo', '', 'bin'), - ('Bislama', 'bi', 'bis'), - ('Siksika', '', 'bla'), - ('Bantu (Other)', '', 'bnt'), - ('Bosnian', 'bs', 'bos'), - ('Braj', '', 'bra'), - ('Breton', 'br', 'bre'), - ('Batak languages', '', 'btk'), - ('Buriat', '', 'bua'), - ('Buginese', '', 'bug'), - ('Bulgarian', 'bg', 'bul'), - ('Burmese', 'my', 'bur'), - ('Blin; Bilin', '', 'byn'), - ('Caddo', '', 'cad'), - ('Central American Indian languages', '', 'cai'), - ('Galibi Carib', '', 'car'), - ('Catalan; Valencian', 'ca', 'cat'), - ('Caucasian languages', '', 'cau'), 
- ('Cebuano', '', 'ceb'), - ('Celtic languages', '', 'cel'), - ('Chamorro', 'ch', 'cha'), - ('Chibcha', '', 'chb'), - ('Chechen', 'ce', 'che'), - ('Chagatai', '', 'chg'), - ('Chinese', 'zh', 'chi'), - ('Chuukese', '', 'chk'), - ('Mari', '', 'chm'), - ('Chinook jargon', '', 'chn'), - ('Choctaw', '', 'cho'), - ('Chipewyan; Dene Suline', '', 'chp'), - ('Cherokee', '', 'chr'), - ('Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic', 'cu', 'chu'), - ('Chuvash', 'cv', 'chv'), - ('Cheyenne', '', 'chy'), - ('Chamic languages', '', 'cmc'), - ('Coptic', '', 'cop'), - ('Cornish', 'kw', 'cor'), - ('Corsican', 'co', 'cos'), - ('Creoles and pidgins, English based', '', 'cpe'), - ('Creoles and pidgins, French-based ', '', 'cpf'), - ('Creoles and pidgins, Portuguese-based ', '', 'cpp'), - ('Cree', 'cr', 'cre'), - ('Crimean Tatar; Crimean Turkish', '', 'crh'), - ('Creoles and pidgins ', '', 'crp'), - ('Kashubian', '', 'csb'), - ('Cushitic languages', '', 'cus'), - ('Czech', 'cs', 'cze'), - ('Dakota', '', 'dak'), - ('Danish', 'da', 'dan'), - ('Dargwa', '', 'dar'), - ('Land Dayak languages', '', 'day'), - ('Delaware', '', 'del'), - ('Slave (Athapascan)', '', 'den'), - ('Dogrib', '', 'dgr'), - ('Dinka', '', 'din'), - ('Divehi; Dhivehi; Maldivian', 'dv', 'div'), - ('Dogri', '', 'doi'), - ('Dravidian languages', '', 'dra'), - ('Lower Sorbian', '', 'dsb'), - ('Duala', '', 'dua'), - ('Dutch, Middle (ca.1050-1350)', '', 'dum'), - ('Dutch; Flemish', 'nl', 'dut'), - ('Dyula', '', 'dyu'), - ('Dzongkha', 'dz', 'dzo'), - ('Efik', '', 'efi'), - ('Egyptian (Ancient)', '', 'egy'), - ('Ekajuk', '', 'eka'), - ('Elamite', '', 'elx'), - ('English', 'en', 'eng'), - ('English, Middle (1100-1500)', '', 'enm'), - ('Esperanto', 'eo', 'epo'), - ('Estonian', 'et', 'est'), - ('Ewe', 'ee', 'ewe'), - ('Ewondo', '', 'ewo'), - ('Fang', '', 'fan'), - ('Faroese', 'fo', 'fao'), - ('Fanti', '', 'fat'), - ('Fijian', 'fj', 'fij'), - ('Filipino; Pilipino', '', 'fil'), - ('Finnish', 'fi', 'fin'), - ('Finno-Ugrian languages', '', 'fiu'), - ('Fon', '', 'fon'), - ('French', 'fr', 'fre'), - ('French, Middle (ca.1400-1600)', '', 'frm'), - ('French, Old (842-ca.1400)', '', 'fro'), - ('Northern Frisian', '', 'frr'), - ('Eastern Frisian', '', 'frs'), - ('Western Frisian', 'fy', 'fry'), - ('Fulah', 'ff', 'ful'), - ('Friulian', '', 'fur'), - ('Ga', '', 'gaa'), - ('Gayo', '', 'gay'), - ('Gbaya', '', 'gba'), - ('Germanic languages', '', 'gem'), - ('Georgian', 'ka', 'geo'), - ('German', 'de', 'ger'), - ('Geez', '', 'gez'), - ('Gilbertese', '', 'gil'), - ('Gaelic; Scottish Gaelic', 'gd', 'gla'), - ('Irish', 'ga', 'gle'), - ('Galician', 'gl', 'glg'), - ('Manx', 'gv', 'glv'), - ('German, Middle High (ca.1050-1500)', '', 'gmh'), - ('German, Old High (ca.750-1050)', '', 'goh'), - ('Gondi', '', 'gon'), - ('Gorontalo', '', 'gor'), - ('Gothic', '', 'got'), - ('Grebo', '', 'grb'), - ('Greek, Ancient (to 1453)', '', 'grc'), - ('Greek, Modern (1453-)', 'el', 'gre'), - ('Guarani', 'gn', 'grn'), - ('Swiss German; Alemannic; Alsatian', '', 'gsw'), - ('Gujarati', 'gu', 'guj'), - ('Gwich\'in', '', 'gwi'), - ('Haida', '', 'hai'), - ('Haitian; Haitian Creole', 'ht', 'hat'), - ('Hausa', 'ha', 'hau'), - ('Hawaiian', '', 'haw'), - ('Hebrew', 'he', 'heb'), - ('Herero', 'hz', 'her'), - ('Hiligaynon', '', 'hil'), - ('Himachali languages; Western Pahari languages', '', 'him'), - ('Hindi', 'hi', 'hin'), - ('Hittite', '', 'hit'), - ('Hmong; Mong', '', 'hmn'), - ('Hiri Motu', 'ho', 'hmo'), - ('Croatian', 'hr', 'hrv'), - ('Upper Sorbian', '', 'hsb'), - 
('Hungarian', 'hu', 'hun'), - ('Hupa', '', 'hup'), - ('Iban', '', 'iba'), - ('Igbo', 'ig', 'ibo'), - ('Icelandic', 'is', 'ice'), - ('Ido', 'io', 'ido'), - ('Sichuan Yi; Nuosu', 'ii', 'iii'), - ('Ijo languages', '', 'ijo'), - ('Inuktitut', 'iu', 'iku'), - ('Interlingue; Occidental', 'ie', 'ile'), - ('Iloko', '', 'ilo'), - ('Interlingua (International Auxiliary Language Association)', 'ia', 'ina'), - ('Indic languages', '', 'inc'), - ('Indonesian', 'id', 'ind'), - ('Indo-European languages', '', 'ine'), - ('Ingush', '', 'inh'), - ('Inupiaq', 'ik', 'ipk'), - ('Iranian languages', '', 'ira'), - ('Iroquoian languages', '', 'iro'), - ('Italian', 'it', 'ita'), - ('Javanese', 'jv', 'jav'), - ('Lojban', '', 'jbo'), - ('Japanese', 'ja', 'jpn'), - ('Judeo-Persian', '', 'jpr'), - ('Judeo-Arabic', '', 'jrb'), - ('Kara-Kalpak', '', 'kaa'), - ('Kabyle', '', 'kab'), - ('Kachin; Jingpho', '', 'kac'), - ('Kalaallisut; Greenlandic', 'kl', 'kal'), - ('Kamba', '', 'kam'), - ('Kannada', 'kn', 'kan'), - ('Karen languages', '', 'kar'), - ('Kashmiri', 'ks', 'kas'), - ('Kanuri', 'kr', 'kau'), - ('Kawi', '', 'kaw'), - ('Kazakh', 'kk', 'kaz'), - ('Kabardian', '', 'kbd'), - ('Khasi', '', 'kha'), - ('Khoisan languages', '', 'khi'), - ('Central Khmer', 'km', 'khm'), - ('Khotanese; Sakan', '', 'kho'), - ('Kikuyu; Gikuyu', 'ki', 'kik'), - ('Kinyarwanda', 'rw', 'kin'), - ('Kirghiz; Kyrgyz', 'ky', 'kir'), - ('Kimbundu', '', 'kmb'), - ('Konkani', '', 'kok'), - ('Komi', 'kv', 'kom'), - ('Kongo', 'kg', 'kon'), - ('Korean', 'ko', 'kor'), - ('Kosraean', '', 'kos'), - ('Kpelle', '', 'kpe'), - ('Karachay-Balkar', '', 'krc'), - ('Karelian', '', 'krl'), - ('Kru languages', '', 'kro'), - ('Kurukh', '', 'kru'), - ('Kuanyama; Kwanyama', 'kj', 'kua'), - ('Kumyk', '', 'kum'), - ('Kurdish', 'ku', 'kur'), - ('Kutenai', '', 'kut'), - ('Ladino', '', 'lad'), - ('Lahnda', '', 'lah'), - ('Lamba', '', 'lam'), - ('Lao', 'lo', 'lao'), - ('Latin', 'la', 'lat'), - ('Latvian', 'lv', 'lav'), - ('Lezghian', '', 'lez'), - ('Limburgan; Limburger; Limburgish', 'li', 'lim'), - ('Lingala', 'ln', 'lin'), - ('Lithuanian', 'lt', 'lit'), - ('Mongo', '', 'lol'), - ('Lozi', '', 'loz'), - ('Luxembourgish; Letzeburgesch', 'lb', 'ltz'), - ('Luba-Lulua', '', 'lua'), - ('Luba-Katanga', 'lu', 'lub'), - ('Ganda', 'lg', 'lug'), - ('Luiseno', '', 'lui'), - ('Lunda', '', 'lun'), - ('Luo (Kenya and Tanzania)', '', 'luo'), - ('Lushai', '', 'lus'), - ('Macedonian', 'mk', 'mac'), - ('Madurese', '', 'mad'), - ('Magahi', '', 'mag'), - ('Marshallese', 'mh', 'mah'), - ('Maithili', '', 'mai'), - ('Makasar', '', 'mak'), - ('Malayalam', 'ml', 'mal'), - ('Mandingo', '', 'man'), - ('Maori', 'mi', 'mao'), - ('Austronesian languages', '', 'map'), - ('Marathi', 'mr', 'mar'), - ('Masai', '', 'mas'), - ('Malay', 'ms', 'may'), - ('Moksha', '', 'mdf'), - ('Mandar', '', 'mdr'), - ('Mende', '', 'men'), - ('Irish, Middle (900-1200)', '', 'mga'), - ('Mi\'kmaq; Micmac', '', 'mic'), - ('Minangkabau', '', 'min'), - ('Uncoded languages', '', 'mis'), - ('Mon-Khmer languages', '', 'mkh'), - ('Malagasy', 'mg', 'mlg'), - ('Maltese', 'mt', 'mlt'), - ('Manchu', '', 'mnc'), - ('Manipuri', '', 'mni'), - ('Manobo languages', '', 'mno'), - ('Mohawk', '', 'moh'), - ('Mongolian', 'mn', 'mon'), - ('Mossi', '', 'mos'), - ('Multiple languages', '', 'mul'), - ('Munda languages', '', 'mun'), - ('Creek', '', 'mus'), - ('Mirandese', '', 'mwl'), - ('Marwari', '', 'mwr'), - ('Mayan languages', '', 'myn'), - ('Erzya', '', 'myv'), - ('Nahuatl languages', '', 'nah'), - ('North American Indian languages', '', 'nai'), - 
('Neapolitan', '', 'nap'), - ('Nauru', 'na', 'nau'), - ('Navajo; Navaho', 'nv', 'nav'), - ('Ndebele, South; South Ndebele', 'nr', 'nbl'), - ('Ndebele, North; North Ndebele', 'nd', 'nde'), - ('Ndonga', 'ng', 'ndo'), - ('Low German; Low Saxon; German, Low; Saxon, Low', '', 'nds'), - ('Nepali', 'ne', 'nep'), - ('Nepal Bhasa; Newari', '', 'new'), - ('Nias', '', 'nia'), - ('Niger-Kordofanian languages', '', 'nic'), - ('Niuean', '', 'niu'), - ('Norwegian Nynorsk; Nynorsk, Norwegian', 'nn', 'nno'), - ('Bokmål, Norwegian; Norwegian Bokmål', 'nb', 'nob'), - ('Nogai', '', 'nog'), - ('Norse, Old', '', 'non'), - ('Norwegian', 'no', 'nor'), - ('N\'Ko', '', 'nqo'), - ('Pedi; Sepedi; Northern Sotho', '', 'nso'), - ('Nubian languages', '', 'nub'), - ('Classical Newari; Old Newari; Classical Nepal Bhasa', '', 'nwc'), - ('Chichewa; Chewa; Nyanja', 'ny', 'nya'), - ('Nyamwezi', '', 'nym'), - ('Nyankole', '', 'nyn'), - ('Nyoro', '', 'nyo'), - ('Nzima', '', 'nzi'), - ('Occitan (post 1500); Provençal', 'oc', 'oci'), - ('Ojibwa', 'oj', 'oji'), - ('Oriya', 'or', 'ori'), - ('Oromo', 'om', 'orm'), - ('Osage', '', 'osa'), - ('Ossetian; Ossetic', 'os', 'oss'), - ('Turkish, Ottoman (1500-1928)', '', 'ota'), - ('Otomian languages', '', 'oto'), - ('Papuan languages', '', 'paa'), - ('Pangasinan', '', 'pag'), - ('Pahlavi', '', 'pal'), - ('Pampanga; Kapampangan', '', 'pam'), - ('Panjabi; Punjabi', 'pa', 'pan'), - ('Papiamento', '', 'pap'), - ('Palauan', '', 'pau'), - ('Persian, Old (ca.600-400 B.C.)', '', 'peo'), - ('Persian', 'fa', 'per'), - ('Philippine languages', '', 'phi'), - ('Phoenician', '', 'phn'), - ('Pali', 'pi', 'pli'), - ('Polish', 'pl', 'pol'), - ('Pohnpeian', '', 'pon'), - ('Portuguese', 'pt', 'por'), - ('Prakrit languages', '', 'pra'), - ('Provençal, Old (to 1500)', '', 'pro'), - ('Pushto; Pashto', 'ps', 'pus'), - ('Reserved for local use', '', 'qaa-qtz'), - ('Quechua', 'qu', 'que'), - ('Rajasthani', '', 'raj'), - ('Rapanui', '', 'rap'), - ('Rarotongan; Cook Islands Maori', '', 'rar'), - ('Romance languages', '', 'roa'), - ('Romansh', 'rm', 'roh'), - ('Romany', '', 'rom'), - ('Romanian; Moldavian; Moldovan', 'ro', 'rum'), - ('Rundi', 'rn', 'run'), - ('Aromanian; Arumanian; Macedo-Romanian', '', 'rup'), - ('Russian', 'ru', 'rus'), - ('Sandawe', '', 'sad'), - ('Sango', 'sg', 'sag'), - ('Yakut', '', 'sah'), - ('South American Indian (Other)', '', 'sai'), - ('Salishan languages', '', 'sal'), - ('Samaritan Aramaic', '', 'sam'), - ('Sanskrit', 'sa', 'san'), - ('Sasak', '', 'sas'), - ('Santali', '', 'sat'), - ('Sicilian', '', 'scn'), - ('Scots', '', 'sco'), - ('Selkup', '', 'sel'), - ('Semitic languages', '', 'sem'), - ('Irish, Old (to 900)', '', 'sga'), - ('Sign Languages', '', 'sgn'), - ('Shan', '', 'shn'), - ('Sidamo', '', 'sid'), - ('Sinhala; Sinhalese', 'si', 'sin'), - ('Siouan languages', '', 'sio'), - ('Sino-Tibetan languages', '', 'sit'), - ('Slavic languages', '', 'sla'), - ('Slovak', 'sk', 'slo'), - ('Slovenian', 'sl', 'slv'), - ('Southern Sami', '', 'sma'), - ('Northern Sami', 'se', 'sme'), - ('Sami languages', '', 'smi'), - ('Lule Sami', '', 'smj'), - ('Inari Sami', '', 'smn'), - ('Samoan', 'sm', 'smo'), - ('Skolt Sami', '', 'sms'), - ('Shona', 'sn', 'sna'), - ('Sindhi', 'sd', 'snd'), - ('Soninke', '', 'snk'), - ('Sogdian', '', 'sog'), - ('Somali', 'so', 'som'), - ('Songhai languages', '', 'son'), - ('Sotho, Southern', 'st', 'sot'), - ('Spanish; Castilian', 'es', 'spa'), - ('Sardinian', 'sc', 'srd'), - ('Sranan Tongo', '', 'srn'), - ('Serbian', 'sr', 'srp'), - ('Serer', '', 'srr'), - ('Nilo-Saharan 
languages', '', 'ssa'), - ('Swati', 'ss', 'ssw'), - ('Sukuma', '', 'suk'), - ('Sundanese', 'su', 'sun'), - ('Susu', '', 'sus'), - ('Sumerian', '', 'sux'), - ('Swahili', 'sw', 'swa'), - ('Swedish', 'sv', 'swe'), - ('Classical Syriac', '', 'syc'), - ('Syriac', '', 'syr'), - ('Tahitian', 'ty', 'tah'), - ('Tai languages', '', 'tai'), - ('Tamil', 'ta', 'tam'), - ('Tatar', 'tt', 'tat'), - ('Telugu', 'te', 'tel'), - ('Timne', '', 'tem'), - ('Tereno', '', 'ter'), - ('Tetum', '', 'tet'), - ('Tajik', 'tg', 'tgk'), - ('Tagalog', 'tl', 'tgl'), - ('Thai', 'th', 'tha'), - ('Tibetan', 'bo', 'tib'), - ('Tigre', '', 'tig'), - ('Tigrinya', 'ti', 'tir'), - ('Tiv', '', 'tiv'), - ('Tokelau', '', 'tkl'), - ('Klingon; tlhIngan-Hol', '', 'tlh'), - ('Tlingit', '', 'tli'), - ('Tamashek', '', 'tmh'), - ('Tonga (Nyasa)', '', 'tog'), - ('Tonga (Tonga Islands)', 'to', 'ton'), - ('Tok Pisin', '', 'tpi'), - ('Tsimshian', '', 'tsi'), - ('Tswana', 'tn', 'tsn'), - ('Tsonga', 'ts', 'tso'), - ('Turkmen', 'tk', 'tuk'), - ('Tumbuka', '', 'tum'), - ('Tupi languages', '', 'tup'), - ('Turkish', 'tr', 'tur'), - ('Altaic languages', '', 'tut'), - ('Tuvalu', '', 'tvl'), - ('Twi', 'tw', 'twi'), - ('Tuvinian', '', 'tyv'), - ('Udmurt', '', 'udm'), - ('Ugaritic', '', 'uga'), - ('Uighur; Uyghur', 'ug', 'uig'), - ('Ukrainian', 'uk', 'ukr'), - ('Umbundu', '', 'umb'), - ('Undetermined', '', 'und'), - ('Urdu', 'ur', 'urd'), - ('Uzbek', 'uz', 'uzb'), - ('Vai', '', 'vai'), - ('Venda', 've', 'ven'), - ('Vietnamese', 'vi', 'vie'), - ('Volapük', 'vo', 'vol'), - ('Votic', '', 'vot'), - ('Wakashan languages', '', 'wak'), - ('Walamo', '', 'wal'), - ('Waray', '', 'war'), - ('Washo', '', 'was'), - ('Welsh', 'cy', 'wel'), - ('Sorbian languages', '', 'wen'), - ('Walloon', 'wa', 'wln'), - ('Wolof', 'wo', 'wol'), - ('Kalmyk; Oirat', '', 'xal'), - ('Xhosa', 'xh', 'xho'), - ('Yao', '', 'yao'), - ('Yapese', '', 'yap'), - ('Yiddish', 'yi', 'yid'), - ('Yoruba', 'yo', 'yor'), - ('Yupik languages', '', 'ypk'), - ('Zapotec', '', 'zap'), - ('Blissymbols; Blissymbolics; Bliss', '', 'zbl'), - ('Zenaga', '', 'zen'), - ('Zhuang; Chuang', 'za', 'zha'), - ('Zande languages', '', 'znd'), - ('Zulu', 'zu', 'zul'), - ('Zuni', '', 'zun'), - ('No linguistic content; Not applicable', '', 'zxx'), - ('Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki', '', 'zza'), - ('Brazilian', 'po', 'pob')] diff --git a/libs/subliminal/services/__init__.py b/libs/subliminal/services/__init__.py index 67e457f..e1aea2d 100755 --- a/libs/subliminal/services/__init__.py +++ b/libs/subliminal/services/__init__.py @@ -15,11 +15,15 @@ # # You should have received a copy of the GNU Lesser General Public License # along with subliminal. If not, see . -from ..exceptions import MissingLanguageError, DownloadFailedError +from .. 
import cache +from ..exceptions import MissingLanguageError, DownloadFailedError, ServiceError +from ..subtitles import EXTENSIONS +from guessit.language import lang_set, UNDETERMINED import logging import os import requests import threading +import zipfile __all__ = ['ServiceBase', 'ServiceConfig'] @@ -37,7 +41,7 @@ class ServiceBase(object): server_url = '' #: User Agent for any HTTP-based requests - user_agent = 'subliminal v0.5' + user_agent = 'subliminal v0.6' #: Whether based on an API or not api_based = False @@ -45,21 +49,18 @@ class ServiceBase(object): #: Timeout for web requests timeout = 5 - #: Lock for cache interactions - lock = threading.Lock() - #: Mapping to Service's language codes and subliminal's languages = {} - #: Whether the mapping is reverted or not - reverted_languages = False - #: Accepted video classes (:class:`~subliminal.videos.Episode`, :class:`~subliminal.videos.Movie`, :class:`~subliminal.videos.UnknownVideo`) videos = [] #: Whether the video has to exist or not require_video = False + #: List of required features for BeautifulSoup + required_features = None + def __init__(self, config=None): self.config = config or ServiceConfig() @@ -75,6 +76,30 @@ class ServiceBase(object): logger.debug(u'Initializing %s' % self.__class__.__name__) self.session = requests.session(timeout=10, headers={'User-Agent': self.user_agent}) + def init_cache(self): + """Initialize cache, make sure it is loaded from disk""" + if not self.config or not self.config.cache: + raise ServiceError('Cache directory is required') + + service_name = self.__class__.__name__ + self.config.cache.load(service_name) + + def save_cache(self): + service_name = self.__class__.__name__ + self.config.cache.save(service_name) + + def clear_cache(self): + service_name = self.__class__.__name__ + self.config.cache.clear(service_name) + + def cache_for(self, func, args, result): + service_name = self.__class__.__name__ + return self.config.cache.cache_for(service_name, func, args, result) + + def cached_value(self, func, args): + service_name = self.__class__.__name__ + return self.config.cache.cached_value(service_name, func, args) + def terminate(self): """Terminate connection""" logger.debug(u'Terminating %s' % self.__class__.__name__) @@ -84,26 +109,21 @@ class ServiceBase(object): pass def list(self, video, languages): - """List subtitles""" - pass + """List subtitles + + As a service writer, you can either override this method or implement + :meth:`list_checked` instead to have the languages pre-filtered for you + + """ + if not self.check_validity(video, languages): + return [] + return self.list_checked(video, languages) def download(self, subtitle): """Download a subtitle""" self.download_file(subtitle.link, subtitle.path) @classmethod - def available_languages(cls): - """Available languages in the Service - - :return: available languages - :rtype: set - - """ - if not cls.reverted_languages: - return set(cls.languages.keys()) - return set(cls.languages.values()) - - @classmethod def check_validity(cls, video, languages): """Check for video and languages validity in the Service @@ -113,72 +133,15 @@ class ServiceBase(object): :rtype: bool """ - languages &= cls.available_languages() + languages = (lang_set(languages) & cls.languages) - set([UNDETERMINED]) if not languages: - logger.debug(u'No language available for service %s' % cls.__class__.__name__.lower()) - return False - if not cls.is_valid_video(video): - logger.debug(u'%r is not valid for service %s' % (video, 
cls.__class__.__name__.lower())) - return False - return True - - @classmethod - def is_valid_video(cls, video): - """Check if video is valid in the Service - - :param video: the video to check - :type video: :class:`~subliminal.videos.Video` - :rtype: bool - - """ - if cls.require_video and not video.exists: + logger.debug(u'No language available for service %s' % cls.__name__.lower()) return False - if not isinstance(video, tuple(cls.videos)): + if cls.require_video and not video.exists or not isinstance(video, tuple(cls.videos)): + logger.debug(u'%r is not valid for service %s' % (video, cls.__name__.lower())) return False return True - @classmethod - def is_valid_language(cls, language): - """Check if language is valid in the Service - - :param string language: the language to check - :rtype: bool - - """ - if language in cls.available_languages(): - return True - return False - - @classmethod - def get_revert_language(cls, language): - """ISO-639-1 language code from service language code - - :param string language: service language code - :return: ISO-639-1 language code - :rtype: string - - """ - if not cls.reverted_languages and language in cls.languages.values(): - return [k for k, v in cls.languages.iteritems() if v == language][0] - if cls.reverted_languages and language in cls.languages.keys(): - return cls.languages[language] - raise MissingLanguageError(language) - - @classmethod - def get_language(cls, language): - """Service language code from ISO-639-1 language code - - :param string language: ISO-639-1 language code - :return: service language code - :rtype: string - - """ - if not cls.reverted_languages and language in cls.languages.keys(): - return cls.languages[language] - if cls.reverted_languages and language in cls.languages.values(): - return [k for k, v in cls.languages.iteritems() if v == language][0] - raise MissingLanguageError(language) - def download_file(self, url, filepath): """Attempt to download a file and remove it in case of failure @@ -198,6 +161,43 @@ class ServiceBase(object): raise DownloadFailedError(str(e)) logger.debug(u'Download finished for file %s. Size: %s' % (filepath, os.path.getsize(filepath))) + def download_zip_file(self, url, filepath): + """Attempt to download a zip file and extract any subtitle file from it, if any. + This cleans up after itself if anything fails. + + :param string url: URL of the zip file to download + :param string filepath: destination path for the subtitle + + """ + logger.info(u'Downloading %s' % url) + try: + zippath = filepath + '.zip' + r = self.session.get(url, headers={'Referer': url, 'User-Agent': self.user_agent}) + with open(zippath, 'wb') as f: + f.write(r.content) + if not zipfile.is_zipfile(zippath): + # TODO: could check if maybe we already have a text file and + # download it directly + raise DownloadFailedError('Downloaded file is not a zip file') + zipsub = zipfile.ZipFile(zippath) + for subfile in zipsub.namelist(): + if os.path.splitext(subfile)[1] in EXTENSIONS: + open(filepath, 'w').write(zipsub.open(subfile).read()) + break + else: + logger.debug(u'No subtitles found in zip file') + raise DownloadFailedError('No subtitles found in zip file') + os.remove(zippath) + logger.debug(u'Download finished for file %s. 
Size: %s' % (filepath, os.path.getsize(filepath))) + return + except Exception as e: + logger.error(u'Download %s failed: %s' % (url, e)) + if os.path.exists(zippath): + os.remove(zippath) + if os.path.exists(filepath): + os.remove(filepath) + raise DownloadFailedError(str(e)) + class ServiceConfig(object): """Configuration for any :class:`Service` @@ -209,6 +209,9 @@ class ServiceConfig(object): def __init__(self, multi=False, cache_dir=None): self.multi = multi self.cache_dir = cache_dir + self.cache = None + if cache_dir is not None: + self.cache = cache.Cache(cache_dir) def __repr__(self): - return 'ServiceConfig(%r, %s)' % (self.multi, self.cache_dir) + return 'ServiceConfig(%r, %s)' % (self.multi, self.cache.cache_dir) diff --git a/libs/subliminal/services/addic7ed.py b/libs/subliminal/services/addic7ed.py new file mode 100755 index 0000000..c754de5 --- /dev/null +++ b/libs/subliminal/services/addic7ed.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- +# Copyright 2012 Olivier Leveau +# +# This file is part of subliminal. +# +# subliminal is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# subliminal is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with subliminal. If not, see . +from . import ServiceBase +from ..cache import cachedmethod +from ..subtitles import get_subtitle_path, ResultSubtitle +from ..videos import Episode +from bs4 import BeautifulSoup +from guessit.language import lang_set +from subliminal.utils import get_keywords +import guessit +import logging +import re + + +logger = logging.getLogger(__name__) + + +def match(pattern, string): + try: + return re.search(pattern, string).group(1) + except AttributeError: + logger.debug(u'Could not match %r on %r' % (pattern, string)) + return None + + +def matches(pattern, string): + try: + return re.search(pattern, string).group(1, 2) + except AttributeError: + logger.debug(u'Could not match %r on %r' % (pattern, string)) + return None + + +class Addic7ed(ServiceBase): + server_url = 'http://www.addic7ed.com' + api_based = False + languages = lang_set([u'English', u'Italian', u'Portuguese', + u'Portuguese (Brazilian)', u'Romanian', + u'Spanish', u'French', u'Greek', u'Arabic', + u'German', u'Croatian', u'Indonesian', u'Hebrew', + u'Russian', u'Turkish', u'Swedish', u'Czech', + u'Dutch', u'Hungarian', u'Norwegian', u'Polish', + u'Persian'], strict=True) + videos = [Episode] + require_video = False + required_features = ['permissive'] + + @cachedmethod + def get_likely_series_id(self, name): + r = self.session.get('%s/shows.php' % self.server_url) + soup = BeautifulSoup(r.content, self.required_features) + for elem in soup.find_all('h3'): + show_name = elem.a.text.lower() + show_id = int(match('show/([0-9]+)', elem.a['href'])) + # we could just return the id of the queried show, but as we + # already downloaded the whole page we might as well fill in the + # information for all the shows + self.cache_for(self.get_likely_series_id, args=(show_name,), result=show_id) + return self.cached_value(self.get_likely_series_id, args=(name,)) + + @cachedmethod + def 
get_episode_url(self, series_id, season, number): + """Get the Addic7ed URL for the given episode. Raises KeyError if none + could be found. + + """ + # download the show's page, which contains ids for all episodes of all seasons + r = self.session.get('%s/show/%d' % (self.server_url, series_id)) + soup = BeautifulSoup(r.content, self.required_features) + form = soup.find('form', attrs={'name': 'multidl'}) + for table in form.find_all('table'): + for row in table.find_all('tr'): + cell = row.find('td', 'MultiDldS') + if not cell: + continue + m = matches('/serie/.+/([0-9]+)/([0-9]+)/', cell.a['href']) + if not m: + continue + episode_url = cell.a['href'] + season_number = int(m[0]) + episode_number = int(m[1]) + # we could just return the url of the queried episode, but as we + # already downloaded the whole page we might as well fill in the + # information for all the episodes of the show + self.cache_for(self.get_episode_url, args=(series_id, season_number, episode_number), result=episode_url) + # raises KeyError if not found + return self.cached_value(self.get_episode_url, args=(series_id, season, number)) + + # Do not cache this method, so that we always check for the most recent + # subtitles + def get_sub_urls(self, episode_url): + suburls = [] + r = self.session.get('%s/%s' % (self.server_url, episode_url)) + epsoup = BeautifulSoup(r.content, self.required_features) + for releaseTable in epsoup.find_all('table', 'tabel95'): + releaseRow = releaseTable.find('td', 'NewsTitle') + if not releaseRow: + continue + release = releaseRow.text.strip() + for row in releaseTable.find_all('tr'): + link = row.find('a', 'buttonDownload') + if not link: + continue + if 'href' not in link.attrs or not (link['href'].startswith('/original') or link['href'].startswith('/updated')): + continue + suburl = link['href'] + lang = guessit.Language(row.find('td', 'language').text.strip()) + result = {'suburl': suburl, 'language': lang, 'release': release} + suburls.append(result) + return suburls + + def list_checked(self, video, languages): + return self.query(video.path or video.release, languages, get_keywords(video.guess), video.series, video.season, video.episode) + + def query(self, filepath, languages, keywords, series, season, episode): + logger.debug(u'Getting subtitles for %s season %d episode %d with languages %r' % (series, season, episode, languages)) + self.init_cache() + try: + sid = self.get_likely_series_id(series.lower()) + except KeyError: + logger.debug(u'Could not find series id for %s' % series) + return [] + + try: + ep_url = self.get_episode_url(sid, season, episode) + except KeyError: + logger.debug(u'Could not find episode id for %s season %d episode %d' % (series, season, episode)) + return [] + suburls = self.get_sub_urls(ep_url) + + # keep only the subtitles in the queried languages + subtitles = [] + for suburl in suburls: + language = suburl['language'] + if language not in languages: + continue + + path = get_subtitle_path(filepath, language, self.config.multi) + subtitle = ResultSubtitle(path, language, self.__class__.__name__.lower(), + '%s/%s' % (self.server_url, suburl['suburl']), + keywords=[suburl['release']]) + subtitles.append(subtitle) + return subtitles + + +Service = Addic7ed diff --git a/libs/subliminal/services/bierdopje.py b/libs/subliminal/services/bierdopje.py index 15401ad..ab6c577 100755 --- a/libs/subliminal/services/bierdopje.py +++ b/libs/subliminal/services/bierdopje.py @@ -16,13 +16,14 @@ # You should have received a copy of the GNU Lesser General Public License
# along with subliminal. If not, see . from . import ServiceBase +from ..cache import cachedmethod from ..exceptions import ServiceError from ..subtitles import get_subtitle_path, ResultSubtitle -from ..videos import Episode from ..utils import to_unicode -import BeautifulSoup +from ..videos import Episode +from bs4 import BeautifulSoup +from guessit.language import lang_set import logging -import os.path import urllib try: import cPickle as pickle @@ -36,30 +37,23 @@ logger = logging.getLogger(__name__) class BierDopje(ServiceBase): server_url = 'http://api.bierdopje.com/A2B638AC5D804C2E/' api_based = True - languages = {'en': 'en', 'nl': 'nl'} - reverted_languages = False + languages = lang_set(['en', 'nl']) videos = [Episode] require_video = False + required_features = ['xml'] - def __init__(self, config=None): - super(BierDopje, self).__init__(config) - self.showids = {} - if self.config and self.config.cache_dir: - self.init_cache() + @cachedmethod + def get_show_id(self, series): + r = self.session.get('%sGetShowByName/%s' % (self.server_url, urllib.quote(series.lower()))) + if r.status_code != 200: + logger.error(u'Request %s returned status code %d' % (r.url, r.status_code)) + return None + soup = BeautifulSoup(r.content, self.required_features) + if soup.status.contents[0] == 'false': + logger.debug(u'Could not find show %s' % series) + return None - def init_cache(self): - logger.debug(u'Initializing cache...') - if not self.config or not self.config.cache_dir: - raise ServiceError('Cache directory is required') - self.showids_cache = os.path.join(self.config.cache_dir, 'bierdopje_showids.cache') - if not os.path.exists(self.showids_cache): - self.save_cache() - - def save_cache(self): - logger.debug(u'Saving showids to cache...') - with self.lock: - with open(self.showids_cache, 'w') as f: - pickle.dump(self.showids, f) + return int(soup.showid.contents[0]) def load_cache(self): logger.debug(u'Loading showids from cache...') @@ -67,25 +61,12 @@ class BierDopje(ServiceBase): with open(self.showids_cache, 'r') as f: self.showids = pickle.load(f) - def query(self, season, episode, languages, filepath, tvdbid=None, series=None): - self.load_cache() + def query(self, filepath, season, episode, languages, tvdbid=None, series=None): + self.init_cache() if series: - if series.lower() in self.showids: # from cache - request_id = self.showids[series.lower()] - logger.debug(u'Retreived showid %d for %s from cache' % (request_id, series)) - else: # query to get showid - logger.debug(u'Getting showid from show name %s...' 
% series) - r = self.session.get('%sGetShowByName/%s' % (self.server_url, urllib.quote(series.lower()))) - if r.status_code != 200: - logger.error(u'Request %s returned status code %d' % (r.url, r.status_code)) - return [] - soup = BeautifulSoup.BeautifulStoneSoup(r.content) - if soup.status.contents[0] == 'false': - logger.debug(u'Could not find show %s' % series) - return [] - request_id = int(soup.showid.contents[0]) - self.showids[series.lower()] = request_id - self.save_cache() + request_id = self.get_show_id(series.lower()) + if request_id is None: + return [] request_source = 'showid' request_is_tvdbid = 'false' elif tvdbid: @@ -96,14 +77,14 @@ class BierDopje(ServiceBase): raise ServiceError('One or more parameter missing') subtitles = [] for language in languages: - logger.debug(u'Getting subtitles for %s %d season %d episode %d with language %s' % (request_source, request_id, season, episode, language)) - r = self.session.get('%sGetAllSubsFor/%s/%s/%s/%s/%s' % (self.server_url, request_id, season, episode, language, request_is_tvdbid)) + logger.debug(u'Getting subtitles for %s %d season %d episode %d with language %s' % (request_source, request_id, season, episode, language.alpha2)) + r = self.session.get('%sGetAllSubsFor/%s/%s/%s/%s/%s' % (self.server_url, request_id, season, episode, language.alpha2, request_is_tvdbid)) if r.status_code != 200: logger.error(u'Request %s returned status code %d' % (r.url, r.status_code)) return [] - soup = BeautifulSoup.BeautifulStoneSoup(r.content) + soup = BeautifulSoup(r.content, self.required_features) if soup.status.contents[0] == 'false': - logger.debug(u'Could not find subtitles for %s %d season %d episode %d with language %s' % (request_source, request_id, season, episode, language)) + logger.debug(u'Could not find subtitles for %s %d season %d episode %d with language %s' % (request_source, request_id, season, episode, language.alpha2)) continue path = get_subtitle_path(filepath, language, self.config.multi) for result in soup.results('result'): @@ -112,11 +93,8 @@ class BierDopje(ServiceBase): subtitles.append(subtitle) return subtitles - def list(self, video, languages): - if not self.check_validity(video, languages): - return [] - results = self.query(video.season, video.episode, languages, video.path or video.release, video.tvdbid, video.series) - return results + def list_checked(self, video, languages): + return self.query(video.path or video.release, video.season, video.episode, languages, video.tvdbid, video.series) Service = BierDopje diff --git a/libs/subliminal/services/opensubtitles.py b/libs/subliminal/services/opensubtitles.py index 9dee27b..cf49380 100755 --- a/libs/subliminal/services/opensubtitles.py +++ b/libs/subliminal/services/opensubtitles.py @@ -18,8 +18,10 @@ from . 
import ServiceBase from ..exceptions import ServiceError, DownloadFailedError from ..subtitles import get_subtitle_path, ResultSubtitle -from ..videos import Episode, Movie from ..utils import to_unicode +from ..videos import Episode, Movie +from guessit.language import lang_set +import guessit import gzip import logging import os.path @@ -32,34 +34,71 @@ logger = logging.getLogger(__name__) class OpenSubtitles(ServiceBase): server_url = 'http://api.opensubtitles.org/xml-rpc' api_based = True - languages = {'aa': 'aar', 'ab': 'abk', 'af': 'afr', 'ak': 'aka', 'sq': 'alb', 'am': 'amh', 'ar': 'ara', - 'an': 'arg', 'hy': 'arm', 'as': 'asm', 'av': 'ava', 'ae': 'ave', 'ay': 'aym', 'az': 'aze', - 'ba': 'bak', 'bm': 'bam', 'eu': 'baq', 'be': 'bel', 'bn': 'ben', 'bh': 'bih', 'bi': 'bis', - 'bs': 'bos', 'br': 'bre', 'bg': 'bul', 'my': 'bur', 'ca': 'cat', 'ch': 'cha', 'ce': 'che', - 'zh': 'chi', 'cu': 'chu', 'cv': 'chv', 'kw': 'cor', 'co': 'cos', 'cr': 'cre', 'cs': 'cze', - 'da': 'dan', 'dv': 'div', 'nl': 'dut', 'dz': 'dzo', 'en': 'eng', 'eo': 'epo', 'et': 'est', - 'ee': 'ewe', 'fo': 'fao', 'fj': 'fij', 'fi': 'fin', 'fr': 'fre', 'fy': 'fry', 'ff': 'ful', - 'ka': 'geo', 'de': 'ger', 'gd': 'gla', 'ga': 'gle', 'gl': 'glg', 'gv': 'glv', 'el': 'ell', - 'gn': 'grn', 'gu': 'guj', 'ht': 'hat', 'ha': 'hau', 'he': 'heb', 'hz': 'her', 'hi': 'hin', - 'ho': 'hmo', 'hr': 'hrv', 'hu': 'hun', 'ig': 'ibo', 'is': 'ice', 'io': 'ido', 'ii': 'iii', - 'iu': 'iku', 'ie': 'ile', 'ia': 'ina', 'id': 'ind', 'ik': 'ipk', 'it': 'ita', 'jv': 'jav', - 'ja': 'jpn', 'kl': 'kal', 'kn': 'kan', 'ks': 'kas', 'kr': 'kau', 'kk': 'kaz', 'km': 'khm', - 'ki': 'kik', 'rw': 'kin', 'ky': 'kir', 'kv': 'kom', 'kg': 'kon', 'ko': 'kor', 'kj': 'kua', - 'ku': 'kur', 'lo': 'lao', 'la': 'lat', 'lv': 'lav', 'li': 'lim', 'ln': 'lin', 'lt': 'lit', - 'lb': 'ltz', 'lu': 'lub', 'lg': 'lug', 'mk': 'mac', 'mh': 'mah', 'ml': 'mal', 'mi': 'mao', - 'mr': 'mar', 'ms': 'may', 'mg': 'mlg', 'mt': 'mlt', 'mo': 'mol', 'mn': 'mon', 'na': 'nau', - 'nv': 'nav', 'nr': 'nbl', 'nd': 'nde', 'ng': 'ndo', 'ne': 'nep', 'nn': 'nno', 'nb': 'nob', - 'no': 'nor', 'ny': 'nya', 'oc': 'oci', 'oj': 'oji', 'or': 'ori', 'om': 'orm', 'os': 'oss', - 'pa': 'pan', 'fa': 'per', 'pi': 'pli', 'pl': 'pol', 'pt': 'por', 'ps': 'pus', 'qu': 'que', - 'rm': 'roh', 'rn': 'run', 'ru': 'rus', 'sg': 'sag', 'sa': 'san', 'sr': 'scc', 'si': 'sin', - 'sk': 'slo', 'sl': 'slv', 'se': 'sme', 'sm': 'smo', 'sn': 'sna', 'sd': 'snd', 'so': 'som', - 'st': 'sot', 'es': 'spa', 'sc': 'srd', 'ss': 'ssw', 'su': 'sun', 'sw': 'swa', 'sv': 'swe', - 'ty': 'tah', 'ta': 'tam', 'tt': 'tat', 'te': 'tel', 'tg': 'tgk', 'tl': 'tgl', 'th': 'tha', - 'bo': 'tib', 'ti': 'tir', 'to': 'ton', 'tn': 'tsn', 'ts': 'tso', 'tk': 'tuk', 'tr': 'tur', - 'tw': 'twi', 'ug': 'uig', 'uk': 'ukr', 'ur': 'urd', 'uz': 'uzb', 've': 'ven', 'vi': 'vie', - 'vo': 'vol', 'cy': 'wel', 'wa': 'wln', 'wo': 'wol', 'xh': 'xho', 'yi': 'yid', 'yo': 'yor', - 'za': 'zha', 'zu': 'zul', 'ro': 'rum', 'po': 'pob', 'un': 'unk', 'ay': 'ass'} - reverted_languages = False + # language list fetched from: + # http://www.opensubtitles.org/addons/export_languages.php + languages = lang_set(['aar', 'abk', 'ace', 'ach', 'ada', 'ady', 'afa', 'afh', + 'afr', 'ain', 'aka', 'akk', 'alb', 'ale', 'alg', 'alt', + 'amh', 'ang', 'apa', 'ara', 'arc', 'arg', 'arm', 'arn', + 'arp', 'art', 'arw', 'asm', 'ast', 'ath', 'aus', 'ava', + 'ave', 'awa', 'aym', 'aze', 'bad', 'bai', 'bak', 'bal', + 'bam', 'ban', 'baq', 'bas', 'bat', 'bej', 'bel', 'bem', + 'ben', 'ber', 'bho', 'bih', 'bik', 'bin', 'bis', 
'bla', + 'bnt', 'bod', 'bos', 'bra', 'bre', 'btk', 'bua', 'bug', + 'bul', 'bur', 'byn', 'cad', 'cai', 'car', 'cat', 'cau', + 'ceb', 'cel', 'cha', 'chb', 'che', 'chg', 'chi', 'chk', + 'chm', 'chn', 'cho', 'chp', 'chr', 'chu', 'chv', 'chy', + 'cmc', 'cop', 'cor', 'cos', 'cpe', 'cpf', 'cpp', 'cre', + 'crh', 'crp', 'csb', 'cus', 'cym', 'cze', 'dak', 'dan', + 'dar', 'day', 'del', 'den', 'deu', 'dgr', 'din', 'div', + 'doi', 'dra', 'dua', 'dum', 'dut', 'dyu', 'dzo', 'efi', + 'egy', 'eka', 'elx', 'eng', 'enm', 'epo', 'est', 'eus', + 'ewe', 'ewo', 'fan', 'fao', 'fas', 'fat', 'fij', 'fil', + 'fin', 'fiu', 'fon', 'fra', 'fre', 'frm', 'fro', 'fry', + 'ful', 'fur', 'gaa', 'gay', 'gba', 'gem', 'geo', 'ger', + 'gez', 'gil', 'gla', 'gle', 'glg', 'glv', 'gmh', 'goh', + 'gon', 'gor', 'got', 'grb', 'grc', 'ell', 'grn', 'guj', + 'gwi', 'hai', 'hat', 'hau', 'haw', 'heb', 'her', 'hil', + 'him', 'hin', 'hit', 'hmn', 'hmo', 'hrv', 'hun', 'hup', + 'hye', 'iba', 'ibo', 'ice', 'ido', 'iii', 'ijo', 'iku', + 'ile', 'ilo', 'ina', 'inc', 'ind', 'ine', 'inh', 'ipk', + 'ira', 'iro', 'isl', 'ita', 'jav', 'jpn', 'jpr', 'jrb', + 'kaa', 'kab', 'kac', 'kal', 'kam', 'kan', 'kar', 'kas', + 'kat', 'kau', 'kaw', 'kaz', 'kbd', 'kha', 'khi', 'khm', + 'kho', 'kik', 'kin', 'kir', 'kmb', 'kok', 'kom', 'kon', + 'kor', 'kos', 'kpe', 'krc', 'kro', 'kru', 'kua', 'kum', + 'kur', 'kut', 'lad', 'lah', 'lam', 'lao', 'lat', 'lav', + 'lez', 'lim', 'lin', 'lit', 'lol', 'loz', 'ltz', 'lua', + 'lub', 'lug', 'lui', 'lun', 'luo', 'lus', 'mac', 'mad', + 'mag', 'mah', 'mai', 'mak', 'mal', 'man', 'mao', 'map', + 'mar', 'mas', 'may', 'mdf', 'mdr', 'men', 'mga', 'mic', + 'min', 'mis', 'mkd', 'mkh', 'mlg', 'mlt', 'mnc', 'mni', + 'mno', 'moh', 'mol', 'mon', 'mos', 'mri', 'msa', 'mwl', + 'mul', 'mun', 'mus', 'mwr', 'mya', 'myn', 'myv', 'nah', + 'nai', 'nap', 'nau', 'nav', 'nbl', 'nde', 'ndo', 'nds', + 'nep', 'new', 'nia', 'nic', 'niu', 'nld', 'nno', 'nob', + 'nog', 'non', 'nor', 'nso', 'nub', 'nwc', 'nya', 'nym', + 'nyn', 'nyo', 'nzi', 'oci', 'oji', 'ori', 'orm', 'osa', + 'oss', 'ota', 'oto', 'paa', 'pag', 'pal', 'pam', 'pan', + 'pap', 'pau', 'peo', 'per', 'phi', 'phn', 'pli', 'pol', + 'pon', 'por', 'pra', 'pro', 'pus', 'que', 'raj', 'rap', + 'rar', 'roa', 'roh', 'rom', 'ron', 'run', 'rup', 'rus', + 'sad', 'sag', 'sah', 'sai', 'sal', 'sam', 'san', 'sas', + 'sat', 'scc', 'scn', 'sco', 'scr', 'sel', 'sem', 'sga', + 'sgn', 'shn', 'sid', 'sin', 'sio', 'sit', 'sla', 'slk', + 'slo', 'slv', 'sma', 'sme', 'smi', 'smj', 'smn', 'smo', + 'sms', 'sna', 'snd', 'snk', 'sog', 'som', 'son', 'sot', + 'spa', 'sqi', 'srd', 'srp', 'srr', 'ssa', 'ssw', 'suk', + 'sun', 'sus', 'sux', 'swa', 'swe', 'syr', 'tah', 'tai', + 'tam', 'tat', 'tel', 'tem', 'ter', 'tet', 'tgk', 'tgl', + 'tha', 'tib', 'tig', 'tir', 'tiv', 'tkl', 'tlh', 'tli', + 'tmh', 'tog', 'ton', 'tpi', 'tsi', 'tsn', 'tso', 'tuk', + 'tum', 'tup', 'tur', 'tut', 'tvl', 'twi', 'tyv', 'udm', + 'uga', 'uig', 'ukr', 'umb', 'und', 'urd', 'uzb', 'vai', + 'ven', 'vie', 'vol', 'vot', 'wak', 'wal', 'war', 'was', + 'wel', 'wen', 'wln', 'wol', 'xal', 'xho', 'yao', 'yap', + 'yid', 'yor', 'ypk', 'zap', 'zen', 'zha', 'zho', 'znd', + 'zul', 'zun', 'rum', 'pob', 'unk', 'ass']) + videos = [Episode, Movie] require_video = False confidence_order = ['moviehash', 'imdbid', 'fulltext'] @@ -92,7 +131,7 @@ class OpenSubtitles(ServiceBase): if not searches: raise ServiceError('One or more parameter missing') for search in searches: - search['sublanguageid'] = ','.join([self.get_language(l) for l in languages]) + search['sublanguageid'] = 
','.join(l.opensubtitles for l in languages) logger.debug(u'Getting subtitles %r with token %s' % (searches, self.token)) results = self.server.SearchSubtitles(self.token, searches) if not results['data']: @@ -100,7 +139,7 @@ class OpenSubtitles(ServiceBase): return [] subtitles = [] for result in results['data']: - language = self.get_revert_language(result['SubLanguageID']) + language = guessit.Language(result['SubLanguageID']) path = get_subtitle_path(filepath, language, self.config.multi) confidence = 1 - float(self.confidence_order.index(result['MatchedBy'])) / float(len(self.confidence_order)) subtitle = ResultSubtitle(path, language, service=self.__class__.__name__.lower(), link=result['SubDownloadLink'], @@ -108,9 +147,7 @@ class OpenSubtitles(ServiceBase): subtitles.append(subtitle) return subtitles - def list(self, video, languages): - if not self.check_validity(video, languages): - return [] + def list_checked(self, video, languages): results = [] if video.exists: results = self.query(video.path or video.release, languages, moviehash=video.hashes['OpenSubtitles'], size=str(video.size)) diff --git a/libs/subliminal/services/podnapisi.py b/libs/subliminal/services/podnapisi.py new file mode 100755 index 0000000..0e12fa0 --- /dev/null +++ b/libs/subliminal/services/podnapisi.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +# Copyright 2011-2012 Antoine Bertin +# +# This file is part of subliminal. +# +# subliminal is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# subliminal is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with subliminal. If not, see . +from . 
import ServiceBase +from ..exceptions import ServiceError, DownloadFailedError +from ..subtitles import get_subtitle_path, ResultSubtitle +from ..utils import to_unicode +from ..videos import Episode, Movie +from guessit.language import lang_set +from hashlib import md5, sha256 +import guessit +import logging +import xmlrpclib + + +logger = logging.getLogger(__name__) + + +class Podnapisi(ServiceBase): + server_url = 'http://ssp.podnapisi.net:8000' + api_based = True + languages = lang_set(['sl', 'en', 'nn', 'ko', 'de', 'is', 'cs', 'fr', 'it', 'bs', 'jp', 'ar', 'ro', + 'hu', 'gr', 'zh', 'lt', 'et', 'lv', 'he', 'nl', 'da', 'sv', 'pl', 'ru', 'es', + 'sq', 'tr', 'fi', 'pt', 'bg', 'mk', 'sr', 'sk', 'hr', 'hi', 'th', 'ca', 'uk', + 'pb', 'ga', 'be', 'vi', 'fa', 'ca', 'id', 'ms']) + #FIXME: ag and cyr not recognized by guessit + videos = [Episode, Movie] + require_video = True + + def __init__(self, config=None): + super(Podnapisi, self).__init__(config) + self.server = xmlrpclib.ServerProxy(self.server_url) + self.token = None + + def init(self): + super(Podnapisi, self).init() + result = self.server.initiate(self.user_agent) + if result['status'] != 200: + raise ServiceError('Initiate failed') + username = 'python_subliminal' + password = sha256(md5('XWFXQ6gE5Oe12rv4qxXX').hexdigest() + result['nonce']).hexdigest() + self.token = result['session'] + result = self.server.authenticate(self.token, username, password) + if result['status'] != 200: + raise ServiceError('Authenticate failed') + + def terminate(self): + super(Podnapisi, self).terminate() + + def query(self, filepath, languages, moviehash): + results = self.server.search(self.token, [moviehash]) + if results['status'] != 200: + logger.error('Search failed with error code %d' % results['status']) + return [] + if not results['results'] or not results['results'][moviehash]['subtitles']: + logger.debug(u'Could not find subtitles for %r with token %s' % (moviehash, self.token)) + return [] + subtitles = [] + for result in results['results'][moviehash]['subtitles']: + language = guessit.Language(result['lang']) + if language == guessit.language.UNDETERMINED or language not in languages: + continue + path = get_subtitle_path(filepath, language, self.config.multi) + subtitle = ResultSubtitle(path, language, service=self.__class__.__name__.lower(), link=result['id'], + release=to_unicode(result['release']), confidence=result['weight']) + subtitles.append(subtitle) + if not subtitles: + return [] + # Convert weight to confidence + max_weight = float(max([s.confidence for s in subtitles])) + min_weight = float(min([s.confidence for s in subtitles])) + for subtitle in subtitles: + if max_weight == 0 and min_weight == 0: + subtitle.confidence = 1.0 + else: + subtitle.confidence = (subtitle.confidence - min_weight) / (max_weight - min_weight) + return subtitles + + def list_checked(self, video, languages): + results = self.query(video.path, languages, video.hashes['OpenSubtitles']) + return results + + def download(self, subtitle): + results = self.server.download(self.token, [subtitle.link]) + if results['status'] != 200: + raise DownloadFailedError() + subtitle.link = 'http://www.podnapisi.net/static/podnapisi/' + results['names'][0]['filename'] + self.download_file(subtitle.link, subtitle.path) + return subtitle + + +Service = Podnapisi diff --git a/libs/subliminal/services/subswiki.py b/libs/subliminal/services/subswiki.py index c5670c1..6671a8d 100755 --- a/libs/subliminal/services/subswiki.py +++ b/libs/subliminal/services/subswiki.py @@ 
-19,8 +19,10 @@ from . import ServiceBase from ..exceptions import ServiceError from ..subtitles import get_subtitle_path, ResultSubtitle from ..videos import Episode, Movie +from bs4 import BeautifulSoup +from guessit.language import lang_set from subliminal.utils import get_keywords, split_keyword -import BeautifulSoup +import guessit import logging import re import urllib @@ -32,17 +34,15 @@ logger = logging.getLogger(__name__) class SubsWiki(ServiceBase): server_url = 'http://www.subswiki.com' api_based = False - languages = {u'English (US)': 'en', u'English (UK)': 'en', u'English': 'en', u'French': 'fr', u'Brazilian': 'po', - u'Portuguese': 'pt', u'Español (Latinoamérica)': 'es', u'Español (España)': 'es', u'Español': 'es', - u'Italian': 'it', u'Català': 'ca'} - reverted_languages = True + languages = lang_set([u'English (US)', u'English (UK)', u'English', u'French', u'Brazilian', + u'Portuguese', u'Español (Latinoamérica)', u'Español (España)', + u'Español', u'Italian', u'Català'], strict=True) videos = [Episode, Movie] require_video = False release_pattern = re.compile('\nVersion (.+), ([0-9]+).([0-9])+ MBs') + required_features = ['permissive'] - def list(self, video, languages): - if not self.check_validity(video, languages): - return [] + def list_checked(self, video, languages): results = [] if isinstance(video, Episode): results = self.query(video.path or video.release, languages, get_keywords(video.guess), series=video.series, season=video.season, episode=video.episode) @@ -74,7 +74,7 @@ class SubsWiki(ServiceBase): if r.status_code != 200: logger.error(u'Request %s returned status code %d' % (r.url, r.status_code)) return [] - soup = BeautifulSoup.BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, self.required_features) subtitles = [] for sub in soup('td', {'class': 'NewsTitle'}): sub_keywords = split_keyword(self.release_pattern.search(sub.contents[1]).group(1).lower()) @@ -82,8 +82,8 @@ class SubsWiki(ServiceBase): logger.debug(u'None of subtitle keywords %r in %r' % (sub_keywords, keywords)) continue for html_language in sub.parent.parent.findAll('td', {'class': 'language'}): - language = self.get_revert_language(html_language.string.strip()) - if not language in languages: + language = guessit.Language(html_language.string.strip()) + if language not in languages: logger.debug(u'Language %r not in wanted languages %r' % (language, languages)) continue html_status = html_language.findNextSibling('td') @@ -96,4 +96,5 @@ class SubsWiki(ServiceBase): subtitles.append(subtitle) return subtitles + Service = SubsWiki diff --git a/libs/subliminal/services/subtitulos.py b/libs/subliminal/services/subtitulos.py index 44888e7..8154aaa 100755 --- a/libs/subliminal/services/subtitulos.py +++ b/libs/subliminal/services/subtitulos.py @@ -18,8 +18,10 @@ from . 
diff --git a/libs/subliminal/services/subtitulos.py b/libs/subliminal/services/subtitulos.py
index 44888e7..8154aaa 100755
--- a/libs/subliminal/services/subtitulos.py
+++ b/libs/subliminal/services/subtitulos.py
@@ -18,8 +18,10 @@ from . import ServiceBase
 from ..subtitles import get_subtitle_path, ResultSubtitle
 from ..videos import Episode
+from bs4 import BeautifulSoup
+from guessit.language import lang_set
 from subliminal.utils import get_keywords, split_keyword
-import BeautifulSoup
+import guessit
 import logging
 import re
 import unicodedata
@@ -32,19 +34,19 @@ logger = logging.getLogger(__name__)
 class Subtitulos(ServiceBase):
     server_url = 'http://www.subtitulos.es'
     api_based = False
-    languages = {u'English (US)': 'en', u'English (UK)': 'en', u'English': 'en', u'French': 'fr', u'Brazilian': 'po',
-                 u'Portuguese': 'pt', u'Español (Latinoamérica)': 'es', u'Español (España)': 'es', u'Español': 'es',
-                 u'Italian': 'it', u'Català': 'ca'}
-    reverted_languages = True
+    languages = lang_set([u'English (US)', u'English (UK)', u'English', u'French', u'Brazilian',
+                          u'Portuguese', u'Español (Latinoamérica)', u'Español (España)', u'Español',
+                          u'Italian', u'Català'], strict=True)
     videos = [Episode]
     require_video = False
-    release_pattern = re.compile('Versión (.+) ([0-9]+).([0-9])+ megabytes')
+    required_features = ['permissive']
+    # the '.+' in the pattern for Version allows us to match both the
+    # '&oacute;' html entity and the 'ó' char directly, because BS4 now
+    # converts html character references into their unicode equivalent
+    release_pattern = re.compile('Versi.+n (.+) ([0-9]+).([0-9])+ megabytes')
 
-    def list(self, video, languages):
-        if not self.check_validity(video, languages):
-            return []
-        results = self.query(video.path or video.release, languages, get_keywords(video.guess), video.series, video.season, video.episode)
-        return results
+    def list_checked(self, video, languages):
+        return self.query(video.path or video.release, languages, get_keywords(video.guess), video.series, video.season, video.episode)
 
     def query(self, filepath, languages, keywords, series, season, episode):
         request_series = series.lower().replace(' ', '_')
@@ -58,7 +60,7 @@ class Subtitulos(ServiceBase):
         if r.status_code != 200:
             logger.error(u'Request %s returned status code %d' % (r.url, r.status_code))
             return []
-        soup = BeautifulSoup.BeautifulSoup(r.content)
+        soup = BeautifulSoup(r.content, self.required_features)
         subtitles = []
         for sub in soup('div', {'id': 'version'}):
             sub_keywords = split_keyword(self.release_pattern.search(sub.find('p', {'class': 'title-sub'}).contents[1]).group(1).lower())
@@ -66,8 +68,8 @@ class Subtitulos(ServiceBase):
                 logger.debug(u'None of subtitle keywords %r in %r' % (sub_keywords, keywords))
                 continue
             for html_language in sub.findAllNext('ul', {'class': 'sslist'}):
-                language = self.get_revert_language(html_language.findNext('li', {'class': 'li-idioma'}).find('strong').contents[0].string.strip())
-                if not language in languages:
+                language = guessit.Language(html_language.findNext('li', {'class': 'li-idioma'}).find('strong').contents[0].string.strip())
+                if language not in languages:
                     logger.debug(u'Language %r not in wanted languages %r' % (language, languages))
                     continue
                 html_status = html_language.findNext('li', {'class': 'li-estado green'})
@@ -80,4 +82,5 @@ class Subtitulos(ServiceBase):
             subtitles.append(subtitle)
         return subtitles
 
+
 Service = Subtitulos
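The comment above is the key to the pattern change: BeautifulSoup 4 decodes HTML character references during parsing, where the old BeautifulSoup 3 code saw the raw entity. A quick illustration (the 'html.parser' backend is used here for brevity; the service itself passes its required_features list):

    from bs4 import BeautifulSoup

    # BS4 turns '&oacute;' into the unicode character, so the regex must
    # match 'Versión' rather than 'Versi&oacute;n'.
    soup = BeautifulSoup('<p>Versi&oacute;n 1.0</p>', 'html.parser')
    print(soup.p.text)  # Versión 1.0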
diff --git a/libs/subliminal/services/thesubdb.py b/libs/subliminal/services/thesubdb.py
index cccddd4..1800dcf 100755
--- a/libs/subliminal/services/thesubdb.py
+++ b/libs/subliminal/services/thesubdb.py
@@ -18,6 +18,8 @@ from . import ServiceBase
 from ..subtitles import get_subtitle_path, ResultSubtitle
 from ..videos import Episode, Movie, UnknownVideo
+from guessit.language import lang_set
+import guessit
 import logging
@@ -26,21 +28,17 @@ logger = logging.getLogger(__name__)
 class TheSubDB(ServiceBase):
     server_url = 'http://api.thesubdb.com/'  # for testing purpose, use http://sandbox.thesubdb.com/ instead
-    user_agent = 'SubDB/1.0 (subliminal/0.5; https://github.com/Diaoul/subliminal)'  # defined by the API
+    user_agent = 'SubDB/1.0 (subliminal/0.6; https://github.com/Diaoul/subliminal)'  # defined by the API
     api_based = True
-    languages = {'af': 'af', 'cs': 'cs', 'da': 'da', 'de': 'de', 'en': 'en', 'es': 'es', 'fi': 'fi',
-                 'fr': 'fr', 'hu': 'hu', 'id': 'id', 'it': 'it', 'la': 'la', 'nl': 'nl', 'no': 'no',
-                 'oc': 'oc', 'pl': 'pl', 'pt': 'pt', 'ro': 'ro', 'ru': 'ru', 'sl': 'sl', 'sr': 'sr',
-                 'sv': 'sv', 'tr': 'tr'}  # list available with the API at http://sandbox.thesubdb.com/?action=languages
-    reverted_languages = False
+    languages = lang_set(['af', 'cs', 'da', 'de', 'en', 'es', 'fi',
+                          'fr', 'hu', 'id', 'it', 'la', 'nl', 'no',
+                          'oc', 'pl', 'pt', 'ro', 'ru', 'sl', 'sr',
+                          'sv', 'tr'], strict=True)  # list available with the API at http://sandbox.thesubdb.com/?action=languages
     videos = [Movie, Episode, UnknownVideo]
     require_video = True
 
-    def list(self, video, languages):
-        if not self.check_validity(video, languages):
-            return []
-        results = self.query(video.path, video.hashes['TheSubDB'], languages)
-        return results
+    def list_checked(self, video, languages):
+        return self.query(video.path, video.hashes['TheSubDB'], languages)
 
     def query(self, filepath, moviehash, languages):
         r = self.session.get(self.server_url, params={'action': 'search', 'hash': moviehash})
@@ -50,7 +48,7 @@ class TheSubDB(ServiceBase):
         if r.status_code != 200:
             logger.error(u'Request %s returned status code %d' % (r.url, r.status_code))
             return []
-        available_languages = set([self.get_revert_language(l) for l in r.content.split(',')])
+        available_languages = set(guessit.Language(l) for l in r.content.split(','))
         languages &= available_languages
         if not languages:
             logger.debug(u'Could not find subtitles for hash %s with languages %r (only %r available)' % (moviehash, languages, available_languages))
@@ -58,8 +56,9 @@ class TheSubDB(ServiceBase):
         subtitles = []
         for language in languages:
             path = get_subtitle_path(filepath, language, self.config.multi)
-            subtitle = ResultSubtitle(path, language, service=self.__class__.__name__.lower(), link='%s?action=download&hash=%s&language=%s' % (self.server_url, moviehash, self.get_language(language)))
+            subtitle = ResultSubtitle(path, language, self.__class__.__name__.lower(), '%s?action=download&hash=%s&language=%s' % (self.server_url, moviehash, language.alpha2))
             subtitles.append(subtitle)
         return subtitles
 
+
 Service = TheSubDB
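The lookup above is keyed on video.hashes['TheSubDB']. Per TheSubDB's API documentation, that hash is the MD5 of the first and the last 64 KiB of the file concatenated; a standalone sketch (independent of subliminal's own implementation, and assuming the file is at least 64 KiB long):

    import hashlib

    def thesubdb_hash(path, readsize=64 * 1024):
        # MD5 of the first and last 64 KiB of the file, concatenated.
        with open(path, 'rb') as f:
            data = f.read(readsize)
            f.seek(-readsize, 2)  # 2 == os.SEEK_END
            data += f.read(readsize)
        return hashlib.md5(data).hexdigest()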
diff --git a/libs/subliminal/services/tvsubtitles.py b/libs/subliminal/services/tvsubtitles.py
new file mode 100755
index 0000000..5c74cc1
--- /dev/null
+++ b/libs/subliminal/services/tvsubtitles.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+# Copyright 2012 Nicolas Wack
+#
+# This file is part of subliminal.
+#
+# subliminal is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# subliminal is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with subliminal. If not, see <http://www.gnu.org/licenses/>.
+from . import ServiceBase
+from ..cache import cachedmethod
+from ..subtitles import get_subtitle_path, ResultSubtitle
+from ..videos import Episode
+from bs4 import BeautifulSoup
+from guessit.language import lang_set
+from subliminal.utils import get_keywords
+import guessit
+import logging
+import re
+
+
+logger = logging.getLogger(__name__)
+
+
+def match(pattern, string):
+    try:
+        return re.search(pattern, string).group(1)
+    except AttributeError:
+        logger.debug(u'Could not match %r on %r' % (pattern, string))
+        return None
+
+
+class TvSubtitles(ServiceBase):
+    server_url = 'http://www.tvsubtitles.net'
+    api_based = False
+    languages = lang_set([u'English', u'Español', u'French', u'German',
+                          u'Brazilian', u'Russian', u'Ukrainian', u'Italian',
+                          u'Greek', u'Arabic', u'Hungarian', u'Polish',
+                          u'Turkish', u'Dutch', u'Portuguese', u'Swedish',
+                          u'Danish', u'Finnish', u'Korean', u'Chinese',
+                          u'Japanese', u'Bulgarian', u'Czech', u'Romanian'], strict=True)
+    videos = [Episode]
+    require_video = False
+    required_features = ['permissive']
+
+    @cachedmethod
+    def get_likely_series_id(self, name):
+        r = self.session.post('%s/search.php' % self.server_url, data={'q': name})
+        soup = BeautifulSoup(r.content, self.required_features)
+        maindiv = soup.find('div', 'left')
+        results = []
+        for elem in maindiv.find_all('li'):
+            sid = int(match('tvshow-([0-9]+)\.html', elem.a['href']))
+            show_name = match('(.*) \(', elem.a.text)
+            results.append((show_name, sid))
+        #TODO: pick up the best one in a smart way
+        result = results[0]
+        return result[1]
+
+    @cachedmethod
+    def get_episode_id(self, series_id, season, number):
+        """Get the TvSubtitles id for the given episode.
+
+        Raises KeyError if none could be found.
+        """
+        # download the page of the season, contains ids for all episodes
+        episode_id = None
+        r = self.session.get('%s/tvshow-%d-%d.html' % (self.server_url, series_id, season))
+        soup = BeautifulSoup(r.content, self.required_features)
+        table = soup.find('table', id='table5')
+        for row in table.find_all('tr'):
+            cells = row.find_all('td')
+            if not cells:
+                continue
+
+            episode_number = match('x([0-9]+)', cells[0].text)
+            if not episode_number:
+                continue
+
+            episode_number = int(episode_number)
+            episode_id = int(match('episode-([0-9]+)', cells[1].a['href']))
+            # we could just return the id of the queried episode, but as we
+            # already downloaded the whole page we might as well fill in the
+            # information for all the episodes of the season
+            self.cache_for(self.get_episode_id, args=(series_id, season, episode_number), result=episode_id)
+        # raises KeyError if not found
+        return self.cached_value(self.get_episode_id, args=(series_id, season, number))
+
+    # Do not cache this method in order to always check for the most recent
+    # subtitles
+    def get_sub_ids(self, episode_id):
+        subids = []
+        r = self.session.get('%s/episode-%d.html' % (self.server_url, episode_id))
+        epsoup = BeautifulSoup(r.content, self.required_features)
+        for subdiv in epsoup.find_all('a'):
+            if 'href' not in subdiv.attrs or not subdiv['href'].startswith('/subtitle'):
+                continue
+            subid = int(match('([0-9]+)', subdiv['href']))
+            lang = guessit.Language(match('flags/(.*).gif', subdiv.img['src']))
+            result = {'subid': subid, 'language': lang}
+            for p in subdiv.find_all('p'):
+                if 'alt' in p.attrs and p['alt'] == 'rip':
+                    result['rip'] = p.text.strip()
+                if 'alt' in p.attrs and p['alt'] == 'release':
+                    result['release'] = p.text.strip()
+
+            subids.append(result)
+        return subids
+
+    def list_checked(self, video, languages):
+        return self.query(video.path or video.release, languages, get_keywords(video.guess), video.series, video.season, video.episode)
+
+    def query(self, filepath, languages, keywords, series, season, episode):
+        logger.debug(u'Getting subtitles for %s season %d episode %d with languages %r' % (series, season, episode, languages))
+        self.init_cache()
+        sid = self.get_likely_series_id(series.lower())
+        try:
+            ep_id = self.get_episode_id(sid, season, episode)
+        except KeyError:
+            logger.debug(u'Could not find episode id for %s season %d episode %d' % (series, season, episode))
+            return []
+        subids = self.get_sub_ids(ep_id)
+        # filter the subtitles with our queried languages
+        subtitles = []
+        for subid in subids:
+            language = subid['language']
+            if language not in languages:
+                continue
+            path = get_subtitle_path(filepath, language, self.config.multi)
+            subtitle = ResultSubtitle(path, language, self.__class__.__name__.lower(),
+                                      '%s/download-%d.html' % (self.server_url, subid['subid']),
+                                      keywords=[subid['rip'], subid['release']])
+            subtitles.append(subtitle)
+        return subtitles
+
+    def download(self, subtitle):
+        self.download_zip_file(subtitle.link, subtitle.path)
+
+
+Service = TvSubtitles
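Two patterns in the new service above are worth spelling out. First, get_episode_id primes the cache for every episode found on the season page before reading back the requested one, so a single HTTP request serves the whole season. The same shape in isolation, with a plain dict standing in for subliminal's cache_for/cached_value helpers (all names here are illustrative):

    # Cache-priming: one expensive fetch fills entries for many keys,
    # then the requested key is read back (KeyError if absent).
    cache = {}

    def fetch_season(series_id, season):
        # stand-in for the HTTP request and HTML scraping done above
        return {1: 1001, 2: 1002, 3: 1003}  # episode number -> episode id

    def get_episode_id(series_id, season, number):
        for ep, ep_id in fetch_season(series_id, season).items():
            cache[(series_id, season, ep)] = ep_id  # prime siblings too
        return cache[(series_id, season, number)]

Second, the #TODO in get_likely_series_id notes that the first search result is taken blindly. One possible refinement (purely a suggestion, not part of this patch) is to rank candidates by name similarity with the stdlib:

    import difflib

    def pick_best_series(query, candidates):
        # candidates: list of (show_name, sid) tuples, as built above;
        # falls back to the first result when nothing is similar enough
        names = [name for name, _ in candidates]
        best = difflib.get_close_matches(query, names, n=1, cutoff=0.6)
        if best:
            return dict(candidates)[best[0]]
        return candidates[0][1]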
diff --git a/libs/subliminal/subtitles.py b/libs/subliminal/subtitles.py
index 355046d..a33c35c 100755
--- a/libs/subliminal/subtitles.py
+++ b/libs/subliminal/subtitles.py
@@ -15,13 +15,13 @@
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with subliminal. If not, see <http://www.gnu.org/licenses/>.
-from .languages import list_languages
 import os.path
+import guessit
+from guessit.language import is_language
 
 __all__ = ['Subtitle', 'EmbeddedSubtitle', 'ExternalSubtitle', 'ResultSubtitle', 'get_subtitle_path']
 
-
 #: Subtitles extensions
 EXTENSIONS = ['.srt', '.sub', '.txt']
 
@@ -30,7 +30,8 @@ class Subtitle(object):
     """Base class for subtitles
 
     :param string path: path to the subtitle
-    :param string language: language of the subtitle (second element of :class:`~subliminal.languages.LANGUAGES`)
+    :param language: language of the subtitle
+    :type language: :class:`guessit.Language`
 
     """
     def __init__(self, path, language):
@@ -49,7 +50,8 @@ class EmbeddedSubtitle(Subtitle):
     """Subtitle embedded in a container
 
     :param string path: path to the subtitle
-    :param string language: language of the subtitle (second element of :class:`~subliminal.languages.LANGUAGES`)
+    :param language: language of the subtitle
+    :type language: :class:`guessit.Language`
     :param int track_id: id of the subtitle track in the container
 
     """
@@ -59,7 +61,7 @@ class EmbeddedSubtitle(Subtitle):
 
     @classmethod
     def from_enzyme(cls, path, subtitle):
-        language = convert_language(subtitle.language, 1, 2)
+        language = guessit.Language(subtitle.language) or None
         return cls(path, language, subtitle.trackno)
 
@@ -76,8 +78,8 @@ class ExternalSubtitle(Subtitle):
         if not extension:
             raise ValueError('Not a supported subtitle extension')
         language = os.path.splitext(path[:len(path) - len(extension)])[1][1:]
-        if not language in list_languages(1):
-            language = None
+        language = guessit.Language(language) or None
+
         return cls(path, language)
 
@@ -85,7 +87,8 @@ class ResultSubtitle(ExternalSubtitle):
     """Subtitle found using :mod:`~subliminal.services`
 
     :param string path: path to the subtitle
-    :param string language: language of the subtitle (second element of :class:`~subliminal.languages.LANGUAGES`)
+    :param language: language of the subtitle
+    :type language: :class:`guessit.Language`
     :param string service: name of the service
     :param string link: download link for the subtitle
     :param string release: release name of the video
@@ -111,20 +114,27 @@ class ResultSubtitle(ExternalSubtitle):
 
         """
         extension = os.path.splitext(self.path)[0]
         language = os.path.splitext(self.path[:len(self.path) - len(extension)])[1][1:]
-        if not language in list_languages(1):
-            return True
-        return False
+        return not is_language(language)
 
     def __repr__(self):
         return 'ResultSubtitle(%s, %s, %.2f, %s)' % (self.language, self.service, self.confidence, self.release)
 
 
 def get_subtitle_path(video_path, language, multi):
-    """Create the subtitle path from the given video path using language if multi"""
+    """Create the subtitle path from the given video path, using the language if multi
+
+    :param string video_path: path to the video
+    :param language: language of the subtitle
+    :type language: :class:`guessit.Language`
+    :param bool multi: whether to use multi language naming or not
+    :return: path of the subtitle
+    :rtype: string
+
+    """
     if not os.path.exists(video_path):
         path = os.path.splitext(os.path.basename(video_path))[0]
     else:
         path = os.path.splitext(video_path)[0]
     if multi and language:
-        return path + '.%s%s' % (language, EXTENSIONS[0])
+        return path + '.%s%s' % (language.alpha2, EXTENSIONS[0])
     return path + '%s' % EXTENSIONS[0]
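For reference, the multi-language naming produced by get_subtitle_path pairs the video's basename with the ISO 639-1 code of the language. A standalone sketch of the naming rule (simplified: the real function also falls back to the basename alone when the video does not exist on disk):

    import os.path

    def subtitle_name(video_path, alpha2, multi):
        root = os.path.splitext(video_path)[0]
        return '%s.%s.srt' % (root, alpha2) if multi else root + '.srt'

    print(subtitle_name('/tmp/Show.S01E01.mkv', 'en', multi=True))
    # /tmp/Show.S01E01.en.srt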
diff --git a/libs/subliminal/videos.py b/libs/subliminal/videos.py
index 79276b1..adfcafe 100755
--- a/libs/subliminal/videos.py
+++ b/libs/subliminal/videos.py
@@ -16,7 +16,6 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with subliminal. If not, see <http://www.gnu.org/licenses/>.
 from . import subtitles
-from .languages import list_languages
 import enzyme
 import guessit
 import hashlib
@@ -130,14 +129,23 @@ class Video(object):
                 logger.debug(u'Failed parsing %s with enzyme' % self.path)
         if isinstance(video_infos, enzyme.core.AVContainer):
             results.extend([subtitles.EmbeddedSubtitle.from_enzyme(self.path, s) for s in video_infos.subtitles])
-        for l in list_languages(1):
-            for e in subtitles.EXTENSIONS:
-                single_path = basepath + '%s' % e
-                if os.path.exists(single_path):
-                    results.append(subtitles.ExternalSubtitle(single_path, None))
-                multi_path = basepath + '.%s%s' % (l, e)
-                if os.path.exists(multi_path):
-                    results.append(subtitles.ExternalSubtitle(multi_path, l))
+
+        # cannot use glob here because it chokes if there are any square
+        # brackets inside the filename, so we have to use basic string
+        # startswith/endswith comparisons
+        folder, basename = os.path.split(basepath)
+        existing = [f for f in os.listdir(folder) if f.startswith(basename)]
+        for path in existing:
+            for ext in subtitles.EXTENSIONS:
+                if path.endswith(ext):
+                    possible_lang = path[len(basename) + 1:-len(ext)]
+                    if possible_lang == '':
+                        results.append(subtitles.ExternalSubtitle(os.path.join(folder, path), None))
+                    else:
+                        lang = guessit.Language(possible_lang)
+                        if lang:
+                            results.append(subtitles.ExternalSubtitle(os.path.join(folder, path), lang))
+
         return results
 
     def __repr__(self):
@@ -189,11 +197,12 @@ class UnknownVideo(Video):
     pass
 
 
-def scan(entry, max_depth=3, depth=0):
+def scan(entry, max_depth=3, scan_filter=None, depth=0):
     """Scan a path for videos and subtitles
 
     :param string entry: path
     :param int max_depth: maximum folder depth
+    :param function scan_filter: function that takes a path and returns ``True`` if it should be filtered out, ``False`` otherwise
     :param int depth: starting depth
     :return: found videos and subtitles
     :rtype: list of (:class:`Video`, [:class:`~subliminal.subtitles.Subtitle`])
@@ -207,13 +216,15 @@ def scan(entry, max_depth=3, scan_filter=None, depth=0):
     if os.path.isdir(entry):
         logger.debug(u'Scanning directory %s with depth %d/%d' % (entry, depth, max_depth))
         result = []
         for e in os.listdir(entry):
-            result.extend(scan(os.path.join(entry, e), max_depth, depth + 1))
+            result.extend(scan(os.path.join(entry, e), max_depth, scan_filter, depth + 1))
         return result
     if os.path.isfile(entry) or depth == 0:
         logger.debug(u'Scanning file %s with depth %d/%d' % (entry, depth, max_depth))
         if depth != 0:  # trust the user: only check for valid format if recursing
             if mimetypes.guess_type(entry)[0] not in MIMETYPES and os.path.splitext(entry)[1] not in EXTENSIONS:
                 return []
+        if scan_filter is not None and scan_filter(entry):
+            return []
         video = Video.from_path(entry)
         return [(video, video.scan())]
     logger.warning(u'Scanning entry %s failed with depth %d/%d' % (entry, depth, max_depth))
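The listdir workaround above exists because glob treats '[' as the start of a character class, so filenames containing square brackets (common in release names) silently match nothing. A quick demonstration via fnmatch, which uses the same pattern syntax as glob:

    import fnmatch

    # '[720p]' is read as a character class matching one of '7', '2',
    # '0' or 'p', never the literal brackets:
    print(fnmatch.fnmatch('Movie.[720p].mkv', 'Movie.[720p].*'))  # False

    # the startswith/endswith scan used above has no metacharacters:
    name = 'Movie.[720p].en.srt'
    print(name.startswith('Movie.[720p]') and name.endswith('.srt'))  # True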