""" articles module (imdb package). This module provides functions and data to handle in a smart way articles (in various languages) at the beginning of movie titles. Copyright 2009 Davide Alberani 2009 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ # List of generic articles used when the language of the title is unknown (or # we don't have information about articles in that language). # XXX: Managing titles in a lot of different languages, a function to recognize # an initial article can't be perfect; sometimes we'll stumble upon a short # word that is an article in some language, but it's not in another; in these # situations we have to choose if we want to interpret this little word # as an article or not (remember that we don't know what the original language # of the title was). # Example: 'en' is (I suppose) an article in Some Language. Unfortunately it # seems also to be a preposition in other languages (French?). # Running a script over the whole list of titles (and aliases), I've found # that 'en' is used as an article only 376 times, and as another thing 594 # times, so I've decided to _always_ consider 'en' as a non article. # # Here is a list of words that are _never_ considered as articles, complete # with the cound of times they are used in a way or another: # 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99), # 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70), # 'da' (23 vs 298), "'n" (8 vs 12) # # I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56) # I'm not sure what '-al' is, and so I've left it out... # # Generic list of articles in utf-8 encoding: GENERIC_ARTICLES = ('the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los', 'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os', 'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-', 'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines', '\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9') # Lists of articles separated by language. If possible, the list should # be sorted by frequency (not very important, but...) # If you want to add a list of articles for another language, mail it # it at imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8 # encoded. LANG_ARTICLES = { 'English': ('the', 'a', 'an'), 'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'", 'uno'), 'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos', 'unas'), 'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'), 'Turkish': (), # Some languages doesn't have articles. } LANG_ARTICLESget = LANG_ARTICLES.get # Maps a language to countries where it is the main language. # If you want to add an entry for another language or country, mail it at # imdbpy-devel@lists.sourceforge.net . _LANG_COUNTRIES = { 'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'), 'Italian': ('Italy',), 'Spanish': ('Spain', 'Mexico'), 'Portuguese': ('Portugal', 'Brazil'), 'Turkish': ('Turkey',), #'German': ('Germany', 'East Germany', 'West Germany'), #'French': ('France'), } # Maps countries to their main language. COUNTRY_LANG = {} for lang in _LANG_COUNTRIES: for country in _LANG_COUNTRIES[lang]: COUNTRY_LANG[country] = lang def toUnicode(articles): """Convert a list of articles utf-8 encoded to unicode strings.""" return tuple([art.decode('utf_8') for art in articles]) def toDicts(articles): """Given a list of utf-8 encoded articles, build two dictionary (one utf-8 encoded and another one with unicode keys) for faster matches.""" uArticles = toUnicode(articles) return dict([(x, x) for x in articles]), dict([(x, x) for x in uArticles]) def addTrailingSpace(articles): """From the given list of utf-8 encoded articles, return two lists (one utf-8 encoded and another one in unicode) where a space is added at the end - if the last char is not ' or -.""" _spArticles = [] _spUnicodeArticles = [] for article in articles: if article[-1] not in ("'", '-'): article += ' ' _spArticles.append(article) _spUnicodeArticles.append(article.decode('utf_8')) return _spArticles, _spUnicodeArticles # Caches. _ART_CACHE = {} _SP_ART_CACHE = {} def articlesDictsForLang(lang): """Return dictionaries of articles specific for the given language, or the default one if the language is not known.""" if lang in _ART_CACHE: return _ART_CACHE[lang] artDicts = toDicts(LANG_ARTICLESget(lang, GENERIC_ARTICLES)) _ART_CACHE[lang] = artDicts return artDicts def spArticlesForLang(lang): """Return lists of articles (plus optional spaces) specific for the given language, or the default one if the language is not known.""" if lang in _SP_ART_CACHE: return _SP_ART_CACHE[lang] spArticles = addTrailingSpace(LANG_ARTICLESget(lang, GENERIC_ARTICLES)) _SP_ART_CACHE[lang] = spArticles return spArticles