You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
142 lines
5.7 KiB
142 lines
5.7 KiB
"""
|
|
articles module (imdb package).
|
|
|
|
This module provides functions and data to handle in a smart way
|
|
articles (in various languages) at the beginning of movie titles.
|
|
|
|
Copyright 2009 Davide Alberani <da@erlug.linux.it>
|
|
2009 H. Turgut Uyar <uyar@tekir.org>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
"""
|
|
|
|
# List of generic articles used when the language of the title is unknown (or
|
|
# we don't have information about articles in that language).
|
|
# XXX: Managing titles in a lot of different languages, a function to recognize
|
|
# an initial article can't be perfect; sometimes we'll stumble upon a short
|
|
# word that is an article in some language, but it's not in another; in these
|
|
# situations we have to choose if we want to interpret this little word
|
|
# as an article or not (remember that we don't know what the original language
|
|
# of the title was).
|
|
# Example: 'en' is (I suppose) an article in Some Language. Unfortunately it
|
|
# seems also to be a preposition in other languages (French?).
|
|
# Running a script over the whole list of titles (and aliases), I've found
|
|
# that 'en' is used as an article only 376 times, and as another thing 594
|
|
# times, so I've decided to _always_ consider 'en' as a non article.
|
|
#
|
|
# Here is a list of words that are _never_ considered as articles, complete
|
|
# with the cound of times they are used in a way or another:
|
|
# 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99),
|
|
# 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70),
|
|
# 'da' (23 vs 298), "'n" (8 vs 12)
|
|
#
|
|
# I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56)
|
|
# I'm not sure what '-al' is, and so I've left it out...
|
|
#
|
|
# Generic list of articles in utf-8 encoding:
|
|
GENERIC_ARTICLES = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
|
|
"l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los',
|
|
'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os',
|
|
'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-',
|
|
'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines',
|
|
'\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9')
|
|
|
|
|
|
# Lists of articles separated by language. If possible, the list should
|
|
# be sorted by frequency (not very important, but...)
|
|
# If you want to add a list of articles for another language, mail it
|
|
# it at imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8
|
|
# encoded.
|
|
LANG_ARTICLES = {
|
|
'English': ('the', 'a', 'an'),
|
|
'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
|
|
'uno'),
|
|
'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos',
|
|
'unas'),
|
|
'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
|
|
'Turkish': (), # Some languages doesn't have articles.
|
|
}
|
|
LANG_ARTICLESget = LANG_ARTICLES.get
|
|
|
|
|
|
# Maps a language to countries where it is the main language.
|
|
# If you want to add an entry for another language or country, mail it at
|
|
# imdbpy-devel@lists.sourceforge.net .
|
|
_LANG_COUNTRIES = {
|
|
'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'),
|
|
'Italian': ('Italy',),
|
|
'Spanish': ('Spain', 'Mexico'),
|
|
'Portuguese': ('Portugal', 'Brazil'),
|
|
'Turkish': ('Turkey',),
|
|
#'German': ('Germany', 'East Germany', 'West Germany'),
|
|
#'French': ('France'),
|
|
}
|
|
|
|
# Maps countries to their main language.
|
|
COUNTRY_LANG = {}
|
|
for lang in _LANG_COUNTRIES:
|
|
for country in _LANG_COUNTRIES[lang]:
|
|
COUNTRY_LANG[country] = lang
|
|
|
|
|
|
def toUnicode(articles):
|
|
"""Convert a list of articles utf-8 encoded to unicode strings."""
|
|
return tuple([art.decode('utf_8') for art in articles])
|
|
|
|
|
|
def toDicts(articles):
|
|
"""Given a list of utf-8 encoded articles, build two dictionary (one
|
|
utf-8 encoded and another one with unicode keys) for faster matches."""
|
|
uArticles = toUnicode(articles)
|
|
return dict([(x, x) for x in articles]), dict([(x, x) for x in uArticles])
|
|
|
|
|
|
def addTrailingSpace(articles):
|
|
"""From the given list of utf-8 encoded articles, return two
|
|
lists (one utf-8 encoded and another one in unicode) where a space
|
|
is added at the end - if the last char is not ' or -."""
|
|
_spArticles = []
|
|
_spUnicodeArticles = []
|
|
for article in articles:
|
|
if article[-1] not in ("'", '-'):
|
|
article += ' '
|
|
_spArticles.append(article)
|
|
_spUnicodeArticles.append(article.decode('utf_8'))
|
|
return _spArticles, _spUnicodeArticles
|
|
|
|
|
|
# Caches.
|
|
_ART_CACHE = {}
|
|
_SP_ART_CACHE = {}
|
|
|
|
def articlesDictsForLang(lang):
|
|
"""Return dictionaries of articles specific for the given language, or the
|
|
default one if the language is not known."""
|
|
if lang in _ART_CACHE:
|
|
return _ART_CACHE[lang]
|
|
artDicts = toDicts(LANG_ARTICLESget(lang, GENERIC_ARTICLES))
|
|
_ART_CACHE[lang] = artDicts
|
|
return artDicts
|
|
|
|
|
|
def spArticlesForLang(lang):
|
|
"""Return lists of articles (plus optional spaces) specific for the
|
|
given language, or the default one if the language is not known."""
|
|
if lang in _SP_ART_CACHE:
|
|
return _SP_ART_CACHE[lang]
|
|
spArticles = addTrailingSpace(LANG_ARTICLESget(lang, GENERIC_ARTICLES))
|
|
_SP_ART_CACHE[lang] = spArticles
|
|
return spArticles
|
|
|
|
|