CouchPotatoServer/libs/imdb/articles.py


								"""

								articles  module (imdb package).


								This module provides functions and data to handle in a smart way

								articles (in various languages) at the beginning of movie titles.


								Copyright 2009 Davide Alberani <da@erlug.linux.it>

								          2009 H. Turgut Uyar <uyar@tekir.org>


								This program is free software; you can redistribute it and/or modify

								it under the terms of the GNU General Public License as published by

								the Free Software Foundation; either version 2 of the License, or

								(at your option) any later version.


								This program is distributed in the hope that it will be useful,

								but WITHOUT ANY WARRANTY; without even the implied warranty of

								MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

								GNU General Public License for more details.


								You should have received a copy of the GNU General Public License

								along with this program; if not, write to the Free Software

								Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

								"""


								# List of generic articles used when the language of the title is unknown (or

								# we don't have information about articles in that language).

								# XXX: Managing titles in a lot of different languages, a function to recognize

								# an initial article can't be perfect; sometimes we'll stumble upon a short

								# word that is an article in some language, but it's not in another; in these

								# situations we have to choose if we want to interpret this little word

								# as an article or not (remember that we don't know what the original language

								# of the title was).

								# Example: 'en' is (I suppose) an article in Some Language.  Unfortunately it

								# seems also to be a preposition in other languages (French?).

								# Running a script over the whole list of titles (and aliases), I've found

								# that 'en' is used as an article only 376 times, and as another thing 594

								# times, so I've decided to _always_ consider 'en' as a non article.

								#

								# Here is a list of words that are _never_ considered as articles, complete

								# with the cound of times they are used in a way or another:

								# 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99),

								# 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70),

								# 'da' (23 vs 298), "'n" (8 vs 12)

								#

								# I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56)

								# I'm not sure what '-al' is, and so I've left it out...

								#

								# Generic list of articles in utf-8 encoding:

								GENERIC_ARTICLES = ('the', 'la', 'a', 'die', 'der', 'le', 'el',

								            "l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los',

								            'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os',

								            'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-',

								            'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines',

								            '\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9')


								# Lists of articles separated by language.  If possible, the list should

								# be sorted by frequency (not very important, but...)

								# If you want to add a list of articles for another language, mail it

								# it at imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8

								# encoded.

								LANG_ARTICLES = {

								    'English': ('the', 'a', 'an'),

								    'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",

								                'uno'),

								    'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos',

								                'unas'),

								    'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),

								    'Turkish': (), # Some languages doesn't have articles.

								}

								LANG_ARTICLESget = LANG_ARTICLES.get


								# Maps a language to countries where it is the main language.

								# If you want to add an entry for another language or country, mail it at

								# imdbpy-devel@lists.sourceforge.net .

								_LANG_COUNTRIES = {

								    'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'),

								    'Italian': ('Italy',),

								    'Spanish': ('Spain', 'Mexico'),

								    'Portuguese': ('Portugal', 'Brazil'),

								    'Turkish': ('Turkey',),

								    #'German': ('Germany', 'East Germany', 'West Germany'),

								    #'French': ('France'),

								}


								# Maps countries to their main language.

								COUNTRY_LANG = {}

								for lang in _LANG_COUNTRIES:

								    for country in _LANG_COUNTRIES[lang]:

								        COUNTRY_LANG[country] = lang


								def toUnicode(articles):

								    """Convert a list of articles utf-8 encoded to unicode strings."""

								    return tuple([art.decode('utf_8') for art in articles])


								def toDicts(articles):

								    """Given a list of utf-8 encoded articles, build two dictionary (one

								    utf-8 encoded and another one with unicode keys) for faster matches."""

								    uArticles = toUnicode(articles)

								    return dict([(x, x) for x in articles]), dict([(x, x) for x in uArticles])


								def addTrailingSpace(articles):

								    """From the given list of utf-8 encoded articles, return two

								    lists (one utf-8 encoded and another one in unicode) where a space

								    is added at the end - if the last char is not ' or -."""

								    _spArticles = []

								    _spUnicodeArticles = []

								    for article in articles:

								        if article[-1] not in ("'", '-'):

								            article += ' '

								        _spArticles.append(article)

								        _spUnicodeArticles.append(article.decode('utf_8'))

								    return _spArticles, _spUnicodeArticles


								# Caches.

								_ART_CACHE = {}

								_SP_ART_CACHE = {}


								def articlesDictsForLang(lang):

								    """Return dictionaries of articles specific for the given language, or the

								    default one if the language is not known."""

								    if lang in _ART_CACHE:

								        return _ART_CACHE[lang]

								    artDicts = toDicts(LANG_ARTICLESget(lang, GENERIC_ARTICLES))

								    _ART_CACHE[lang] = artDicts

								    return artDicts


								def spArticlesForLang(lang):

								    """Return lists of articles (plus optional spaces) specific for the

								    given language, or the default one if the language is not known."""

								    if lang in _SP_ART_CACHE:

								        return _SP_ART_CACHE[lang]

								    spArticles = addTrailingSpace(LANG_ARTICLESget(lang, GENERIC_ARTICLES))

								    _SP_ART_CACHE[lang] = spArticles

								    return spArticles