|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# GuessIt - A library for guessing information from filenames
|
|
|
|
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
|
|
|
|
#
|
|
|
|
# GuessIt is free software; you can redistribute it and/or modify it under
|
|
|
|
# the terms of the Lesser GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# GuessIt is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# Lesser GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the Lesser GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
|
|
|
|
from guessit import fileutils
|
|
|
|
import os.path
|
|
|
|
import re
|
|
|
|
import logging
|
|
|
|
|
|
|
|
log = logging.getLogger('guessit.language')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
|
|
|
|
#
|
|
|
|
# Description of the fields:
|
|
|
|
# "An alpha-3 (bibliographic) code, an alpha-3 (terminologic) code (when given),
|
|
|
|
# an alpha-2 code (when given), an English name, and a French name of a language
|
|
|
|
# are all separated by pipe (|) characters."
|
|
|
|
# Parse the ISO-639-2 table shipped next to this module into a list of rows,
# each row being the list of pipe-separated unicode fields of one line.
#  - 'utf-8-sig' decodes exactly like 'utf-8' but also drops a leading
#    byte-order mark, so that a BOM at the very start of the data file (if
#    there is one -- TODO confirm against the shipped file) does not end up
#    glued to the first language code
#  - blank lines (e.g. the one produced by a trailing newline) are skipped:
#    they would give a 1-field row and make the column lookups on
#    language_matrix fail with an IndexError
language_matrix = [ l.strip().decode('utf-8-sig').split('|')
                    for l in fileutils.load_file_in_same_dir(__file__, 'ISO-639-2_utf-8.txt').split('\n')
                    if l.strip() ]
|
|
|
|
|
|
|
|
# Sets of all the known values found in each code column of the table;
# empty fields (language without that kind of code) are left out.
lng3 = frozenset(row[0] for row in language_matrix if row[0])
lng3term = frozenset(row[1] for row in language_matrix if row[1])
lng2 = frozenset(row[2] for row in language_matrix if row[2])

# A name field can hold several alternatives separated by '; ': each of them
# is stored on its own, lowercased.
lng_en_name = frozenset(name
                        for row in language_matrix
                        for name in row[3].lower().split('; ')
                        if name)
lng_fr_name = frozenset(name
                        for row in language_matrix
                        for name in row[4].lower().split('; ')
                        if name)

# Every string that designates a language, whatever its form (code or name).
lng_all_names = frozenset().union(lng3, lng3term, lng2, lng_en_name, lng_fr_name)
|
|
|
|
|
|
|
|
# Conversion tables between the 3-letter bibliographic code (first column,
# used as the reference key) and the other representations of a language.
# Rows whose relevant field is empty do not appear in the matching tables.

# bibliographic <-> terminologic 3-letter codes
lng3_to_lng3term = dict((row[0], row[1])
                        for row in language_matrix
                        if row[1])
lng3term_to_lng3 = dict((row[1], row[0])
                        for row in language_matrix
                        if row[1])

# 3-letter <-> 2-letter codes
lng3_to_lng2 = dict((row[0], row[2])
                    for row in language_matrix
                    if row[2])
lng2_to_lng3 = dict((row[2], row[0])
                    for row in language_matrix
                    if row[2])

# English names: going from a code to a name, only the first listed name is
# kept (hopefully the most common one); going from a name to a code, every
# listed alternative is accepted, lowercased.
lng3_to_lng_en_name = dict((row[0], row[3].partition('; ')[0])
                           for row in language_matrix
                           if row[3])
lng_en_name_to_lng3 = dict((name.lower(), row[0])
                           for row in language_matrix
                           if row[3]
                           for name in row[3].split('; '))

# French names: same conventions as for the English ones.
lng3_to_lng_fr_name = dict((row[0], row[4].partition('; ')[0])
                           for row in language_matrix
                           if row[4])
lng_fr_name_to_lng3 = dict((name.lower(), row[0])
                           for row in language_matrix
                           if row[4]
                           for name in row[4].split('; '))
|
|
|
|
|
|
|
|
|
|
|
|
def is_language(language):
    """Tell whether the given string designates a known language.

    The check is case-insensitive and succeeds for anything contained in
    lng_all_names, ie: 2-letter codes, 3-letter codes (bibliographic or
    terminologic), English names and French names.
    """
    normalized = language.lower()
    return normalized in lng_all_names
|
|
|
|
|
|
|
|
class Language(object):
    """This class represents a human language.

    You can initialize it with pretty much everything, as it knows conversion from
    ISO-639 2-letter and 3-letter codes, English and French names.

    Internally, the language is stored in the `lang` attribute as its 3-letter
    bibliographic code, which is what equality and hashing are based on.

    >>> Language('fr')
    Language(French)

    >>> Language('eng').french_name()
    u'anglais'
    """
    def __init__(self, language):
        """Identify the language designated by the given string.

        The lookup is case-insensitive. Strings of 2 (resp. 3) characters are
        first tried as 2-letter (resp. 3-letter) codes; anything that could
        not be resolved as a code is then tried as an English name, then as a
        French name.

        Raises ValueError if the string cannot be identified as a language.
        """
        lang = None
        language = language.lower()
        if len(language) == 2:
            lang = lng2_to_lng3.get(language)
        elif len(language) == 3:
            lang = language if language in lng3 else lng3term_to_lng3.get(language)

        if lang is None:
            # not a code: try the names. This is also attempted for 2- and
            # 3-character strings, as a short language name is not necessarily
            # a valid code, and we want everything accepted by is_language()
            # to be accepted here as well
            lang = lng_en_name_to_lng3.get(language) or lng_fr_name_to_lng3.get(language)

        if lang is None:
            raise ValueError('The given string "%s" could not be identified as a language' % language)

        self.lang = lang

    def lng2(self):
        """Return the 2-letter code (raises KeyError if the language has none)."""
        return lng3_to_lng2[self.lang]

    def lng3(self):
        """Return the 3-letter bibliographic code."""
        return self.lang

    def lng3term(self):
        """Return the 3-letter terminologic code (raises KeyError if the
        language has none)."""
        return lng3_to_lng3term[self.lang]

    def english_name(self):
        """Return the first English name listed for this language."""
        return lng3_to_lng_en_name[self.lang]

    def french_name(self):
        """Return the first French name listed for this language."""
        return lng3_to_lng_fr_name[self.lang]

    def __hash__(self):
        # consistent with __eq__ between Language instances
        return hash(self.lang)

    def __eq__(self, other):
        """Compare with another Language, or with a string designating one.

        A string which cannot be identified as a language, or an object of
        any other type, is never equal to a Language.
        """
        if isinstance(other, Language):
            return self.lang == other.lang

        if isinstance(other, basestring):
            try:
                return self == Language(other)
            except ValueError:
                return False

        return False

    def __ne__(self, other):
        return not self == other

    def __unicode__(self):
        return lng3_to_lng_en_name[self.lang]

    def __str__(self):
        # byte string version of the English name, encoded as UTF-8
        return unicode(self).encode('utf-8')

    def __repr__(self):
        return 'Language(%s)' % self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_language(string, lang_filter=None):
    """Looks for language patterns, and if found return the language object,
    its group span and an associated confidence.

    you can specify a list of allowed languages using the lang_filter argument,
    as in lang_filter = [ 'fr', 'eng', 'spanish' ]

    A language is only recognized when it is delimited on both sides by a
    separator character; the beginning and the end of the string also count as
    delimiters, so the string may (but does not need to) be wrapped in
    sentinels.

    The candidates are tried in the iteration order of lng_all_names and the
    first acceptable one is returned, so when a string contains several
    languages, which one is reported is unspecified.

    Returns a (Language, (start, end), confidence) tuple, or
    (None, None, None) if no language was found.

    Raises ValueError if lang_filter contains a string which cannot be
    identified as a language.

    >>> search_language('movie [en].avi')
    (Language(English), (7, 9), 0.80000000000000004)

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])
    (None, None, None)
    """

    # list of common words which could be interpreted as languages, but which
    # are far too common to be able to say they represent a language in the
    # middle of a string (where they most likely carry their common meaning)
    lng_common_words = frozenset([
        # english words
        'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to',
        'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan',
        'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as',
        'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr',
        # french words
        'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que',
        'mal', 'est', 'vol', 'or', 'mon', 'se',
        # spanish words
        'la', 'el', 'del', 'por', 'mar',
        # other
        'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san'
        ])

    # set of delimiter characters (used for membership tests, not as a regexp)
    sep = r'[](){} \._-+'

    if lang_filter:
        lang_filter = set(Language(l) for l in lang_filter)

    slow = string.lower()
    length = len(slow)

    for lang in lng_all_names:

        if lang in lng_common_words:
            continue

        # look at all the occurrences of the candidate, not only the first
        # one, until we find one which is surrounded by separators
        pos = slow.find(lang)
        while pos != -1:
            end = pos + len(lang)
            starts_clean = pos == 0 or slow[pos-1] in sep
            ends_clean = end == length or slow[end] in sep
            if starts_clean and ends_clean:
                break
            pos = slow.find(lang, pos + 1)

        if pos == -1:
            continue

        try:
            language = Language(slow[pos:end])
        except ValueError:
            # Language might not be able to resolve all the strings found in
            # lng_all_names; this should not abort the whole search
            continue

        if lang_filter and language not in lang_filter:
            continue

        # only allow those languages that have a 2-letter code, those who
        # don't are too esoteric and probably false matches
        if language.lang not in lng3_to_lng2:
            continue

        # confidence depends on lng2, lng3, english name, ...
        if len(lang) == 2:
            confidence = 0.8
        elif len(lang) == 3:
            confidence = 0.9
        else:
            # Note: we could either be really confident that we found a language
            # or assume that full language names are too common words
            confidence = 0.3 # going with the low-confidence route here

        return language, (pos, end), confidence

    return None, None, None
|