From f934ef2edec0168d76659b2729adc1fe4402e283 Mon Sep 17 00:00:00 2001 From: Ruud Date: Fri, 4 May 2012 19:52:58 +0200 Subject: [PATCH] Update guessit --- libs/guessit/ISO-3166-1_utf8.txt | 249 +++++++++++++++++++++ libs/guessit/__init__.py | 4 +- libs/guessit/country.py | 113 ++++++++++ libs/guessit/guess.py | 2 +- libs/guessit/language.py | 158 ++++++++++--- libs/guessit/matcher.py | 5 +- libs/guessit/matchtree.py | 2 +- libs/guessit/patterns.py | 7 +- libs/guessit/transfo/__init__.py | 2 +- libs/guessit/transfo/guess_bonus_features.py | 2 +- libs/guessit/transfo/guess_date.py | 2 +- .../transfo/guess_episode_info_from_position.py | 5 +- libs/guessit/transfo/guess_episodes_rexps.py | 2 +- libs/guessit/transfo/guess_filetype.py | 2 +- libs/guessit/transfo/guess_language.py | 2 +- .../transfo/guess_movie_title_from_position.py | 2 +- libs/guessit/transfo/guess_properties.py | 2 +- libs/guessit/transfo/guess_release_group.py | 2 +- libs/guessit/transfo/guess_video_rexps.py | 2 +- libs/guessit/transfo/guess_weak_episodes_rexps.py | 2 +- libs/guessit/transfo/guess_website.py | 2 +- libs/guessit/transfo/guess_year.py | 2 +- libs/guessit/transfo/post_process.py | 2 +- libs/guessit/transfo/split_explicit_groups.py | 2 +- libs/guessit/transfo/split_on_dash.py | 2 +- libs/guessit/transfo/split_path_components.py | 2 +- 26 files changed, 527 insertions(+), 52 deletions(-) create mode 100644 libs/guessit/ISO-3166-1_utf8.txt create mode 100644 libs/guessit/country.py diff --git a/libs/guessit/ISO-3166-1_utf8.txt b/libs/guessit/ISO-3166-1_utf8.txt new file mode 100644 index 0000000..7022040 --- /dev/null +++ b/libs/guessit/ISO-3166-1_utf8.txt @@ -0,0 +1,249 @@ +Afghanistan|AF|AFG|004|ISO 3166-2:AF +Åland Islands|AX|ALA|248|ISO 3166-2:AX +Albania|AL|ALB|008|ISO 3166-2:AL +Algeria|DZ|DZA|012|ISO 3166-2:DZ +American Samoa|AS|ASM|016|ISO 3166-2:AS +Andorra|AD|AND|020|ISO 3166-2:AD +Angola|AO|AGO|024|ISO 3166-2:AO +Anguilla|AI|AIA|660|ISO 3166-2:AI +Antarctica|AQ|ATA|010|ISO 
3166-2:AQ +Antigua and Barbuda|AG|ATG|028|ISO 3166-2:AG +Argentina|AR|ARG|032|ISO 3166-2:AR +Armenia|AM|ARM|051|ISO 3166-2:AM +Aruba|AW|ABW|533|ISO 3166-2:AW +Australia|AU|AUS|036|ISO 3166-2:AU +Austria|AT|AUT|040|ISO 3166-2:AT +Azerbaijan|AZ|AZE|031|ISO 3166-2:AZ +Bahamas|BS|BHS|044|ISO 3166-2:BS +Bahrain|BH|BHR|048|ISO 3166-2:BH +Bangladesh|BD|BGD|050|ISO 3166-2:BD +Barbados|BB|BRB|052|ISO 3166-2:BB +Belarus|BY|BLR|112|ISO 3166-2:BY +Belgium|BE|BEL|056|ISO 3166-2:BE +Belize|BZ|BLZ|084|ISO 3166-2:BZ +Benin|BJ|BEN|204|ISO 3166-2:BJ +Bermuda|BM|BMU|060|ISO 3166-2:BM +Bhutan|BT|BTN|064|ISO 3166-2:BT +Bolivia, Plurinational State of|BO|BOL|068|ISO 3166-2:BO +Bonaire, Sint Eustatius and Saba|BQ|BES|535|ISO 3166-2:BQ +Bosnia and Herzegovina|BA|BIH|070|ISO 3166-2:BA +Botswana|BW|BWA|072|ISO 3166-2:BW +Bouvet Island|BV|BVT|074|ISO 3166-2:BV +Brazil|BR|BRA|076|ISO 3166-2:BR +British Indian Ocean Territory|IO|IOT|086|ISO 3166-2:IO +Brunei Darussalam|BN|BRN|096|ISO 3166-2:BN +Bulgaria|BG|BGR|100|ISO 3166-2:BG +Burkina Faso|BF|BFA|854|ISO 3166-2:BF +Burundi|BI|BDI|108|ISO 3166-2:BI +Cambodia|KH|KHM|116|ISO 3166-2:KH +Cameroon|CM|CMR|120|ISO 3166-2:CM +Canada|CA|CAN|124|ISO 3166-2:CA +Cape Verde|CV|CPV|132|ISO 3166-2:CV +Cayman Islands|KY|CYM|136|ISO 3166-2:KY +Central African Republic|CF|CAF|140|ISO 3166-2:CF +Chad|TD|TCD|148|ISO 3166-2:TD +Chile|CL|CHL|152|ISO 3166-2:CL +China|CN|CHN|156|ISO 3166-2:CN +Christmas Island|CX|CXR|162|ISO 3166-2:CX +Cocos (Keeling) Islands|CC|CCK|166|ISO 3166-2:CC +Colombia|CO|COL|170|ISO 3166-2:CO +Comoros|KM|COM|174|ISO 3166-2:KM +Congo|CG|COG|178|ISO 3166-2:CG +Congo, the Democratic Republic of the|CD|COD|180|ISO 3166-2:CD +Cook Islands|CK|COK|184|ISO 3166-2:CK +Costa Rica|CR|CRI|188|ISO 3166-2:CR +Côte d'Ivoire|CI|CIV|384|ISO 3166-2:CI +Croatia|HR|HRV|191|ISO 3166-2:HR +Cuba|CU|CUB|192|ISO 3166-2:CU +Curaçao|CW|CUW|531|ISO 3166-2:CW +Cyprus|CY|CYP|196|ISO 3166-2:CY +Czech Republic|CZ|CZE|203|ISO 3166-2:CZ +Denmark|DK|DNK|208|ISO 3166-2:DK 
+Djibouti|DJ|DJI|262|ISO 3166-2:DJ +Dominica|DM|DMA|212|ISO 3166-2:DM +Dominican Republic|DO|DOM|214|ISO 3166-2:DO +Ecuador|EC|ECU|218|ISO 3166-2:EC +Egypt|EG|EGY|818|ISO 3166-2:EG +El Salvador|SV|SLV|222|ISO 3166-2:SV +Equatorial Guinea|GQ|GNQ|226|ISO 3166-2:GQ +Eritrea|ER|ERI|232|ISO 3166-2:ER +Estonia|EE|EST|233|ISO 3166-2:EE +Ethiopia|ET|ETH|231|ISO 3166-2:ET +Falkland Islands (Malvinas|FK|FLK|238|ISO 3166-2:FK +Faroe Islands|FO|FRO|234|ISO 3166-2:FO +Fiji|FJ|FJI|242|ISO 3166-2:FJ +Finland|FI|FIN|246|ISO 3166-2:FI +France|FR|FRA|250|ISO 3166-2:FR +French Guiana|GF|GUF|254|ISO 3166-2:GF +French Polynesia|PF|PYF|258|ISO 3166-2:PF +French Southern Territories|TF|ATF|260|ISO 3166-2:TF +Gabon|GA|GAB|266|ISO 3166-2:GA +Gambia|GM|GMB|270|ISO 3166-2:GM +Georgia|GE|GEO|268|ISO 3166-2:GE +Germany|DE|DEU|276|ISO 3166-2:DE +Ghana|GH|GHA|288|ISO 3166-2:GH +Gibraltar|GI|GIB|292|ISO 3166-2:GI +Greece|GR|GRC|300|ISO 3166-2:GR +Greenland|GL|GRL|304|ISO 3166-2:GL +Grenada|GD|GRD|308|ISO 3166-2:GD +Guadeloupe|GP|GLP|312|ISO 3166-2:GP +Guam|GU|GUM|316|ISO 3166-2:GU +Guatemala|GT|GTM|320|ISO 3166-2:GT +Guernsey|GG|GGY|831|ISO 3166-2:GG +Guinea|GN|GIN|324|ISO 3166-2:GN +Guinea-Bissau|GW|GNB|624|ISO 3166-2:GW +Guyana|GY|GUY|328|ISO 3166-2:GY +Haiti|HT|HTI|332|ISO 3166-2:HT +Heard Island and McDonald Islands|HM|HMD|334|ISO 3166-2:HM +Holy See (Vatican City State|VA|VAT|336|ISO 3166-2:VA +Honduras|HN|HND|340|ISO 3166-2:HN +Hong Kong|HK|HKG|344|ISO 3166-2:HK +Hungary|HU|HUN|348|ISO 3166-2:HU +Iceland|IS|ISL|352|ISO 3166-2:IS +India|IN|IND|356|ISO 3166-2:IN +Indonesia|ID|IDN|360|ISO 3166-2:ID +Iran, Islamic Republic of|IR|IRN|364|ISO 3166-2:IR +Iraq|IQ|IRQ|368|ISO 3166-2:IQ +Ireland|IE|IRL|372|ISO 3166-2:IE +Isle of Man|IM|IMN|833|ISO 3166-2:IM +Israel|IL|ISR|376|ISO 3166-2:IL +Italy|IT|ITA|380|ISO 3166-2:IT +Jamaica|JM|JAM|388|ISO 3166-2:JM +Japan|JP|JPN|392|ISO 3166-2:JP +Jersey|JE|JEY|832|ISO 3166-2:JE +Jordan|JO|JOR|400|ISO 3166-2:JO +Kazakhstan|KZ|KAZ|398|ISO 3166-2:KZ 
+Kenya|KE|KEN|404|ISO 3166-2:KE +Kiribati|KI|KIR|296|ISO 3166-2:KI +Korea, Democratic People's Republic of|KP|PRK|408|ISO 3166-2:KP +Korea, Republic of|KR|KOR|410|ISO 3166-2:KR +Kuwait|KW|KWT|414|ISO 3166-2:KW +Kyrgyzstan|KG|KGZ|417|ISO 3166-2:KG +Lao People's Democratic Republic|LA|LAO|418|ISO 3166-2:LA +Latvia|LV|LVA|428|ISO 3166-2:LV +Lebanon|LB|LBN|422|ISO 3166-2:LB +Lesotho|LS|LSO|426|ISO 3166-2:LS +Liberia|LR|LBR|430|ISO 3166-2:LR +Libya|LY|LBY|434|ISO 3166-2:LY +Liechtenstein|LI|LIE|438|ISO 3166-2:LI +Lithuania|LT|LTU|440|ISO 3166-2:LT +Luxembourg|LU|LUX|442|ISO 3166-2:LU +Macao|MO|MAC|446|ISO 3166-2:MO +Macedonia, the former Yugoslav Republic of|MK|MKD|807|ISO 3166-2:MK +Madagascar|MG|MDG|450|ISO 3166-2:MG +Malawi|MW|MWI|454|ISO 3166-2:MW +Malaysia|MY|MYS|458|ISO 3166-2:MY +Maldives|MV|MDV|462|ISO 3166-2:MV +Mali|ML|MLI|466|ISO 3166-2:ML +Malta|MT|MLT|470|ISO 3166-2:MT +Marshall Islands|MH|MHL|584|ISO 3166-2:MH +Martinique|MQ|MTQ|474|ISO 3166-2:MQ +Mauritania|MR|MRT|478|ISO 3166-2:MR +Mauritius|MU|MUS|480|ISO 3166-2:MU +Mayotte|YT|MYT|175|ISO 3166-2:YT +Mexico|MX|MEX|484|ISO 3166-2:MX +Micronesia, Federated States of|FM|FSM|583|ISO 3166-2:FM +Moldova, Republic of|MD|MDA|498|ISO 3166-2:MD +Monaco|MC|MCO|492|ISO 3166-2:MC +Mongolia|MN|MNG|496|ISO 3166-2:MN +Montenegro|ME|MNE|499|ISO 3166-2:ME +Montserrat|MS|MSR|500|ISO 3166-2:MS +Morocco|MA|MAR|504|ISO 3166-2:MA +Mozambique|MZ|MOZ|508|ISO 3166-2:MZ +Myanmar|MM|MMR|104|ISO 3166-2:MM +Namibia|NA|NAM|516|ISO 3166-2:NA +Nauru|NR|NRU|520|ISO 3166-2:NR +Nepal|NP|NPL|524|ISO 3166-2:NP +Netherlands|NL|NLD|528|ISO 3166-2:NL +New Caledonia|NC|NCL|540|ISO 3166-2:NC +New Zealand|NZ|NZL|554|ISO 3166-2:NZ +Nicaragua|NI|NIC|558|ISO 3166-2:NI +Niger|NE|NER|562|ISO 3166-2:NE +Nigeria|NG|NGA|566|ISO 3166-2:NG +Niue|NU|NIU|570|ISO 3166-2:NU +Norfolk Island|NF|NFK|574|ISO 3166-2:NF +Northern Mariana Islands|MP|MNP|580|ISO 3166-2:MP +Norway|NO|NOR|578|ISO 3166-2:NO +Oman|OM|OMN|512|ISO 3166-2:OM +Pakistan|PK|PAK|586|ISO 3166-2:PK 
+Palau|PW|PLW|585|ISO 3166-2:PW +Palestinian Territory, Occupied|PS|PSE|275|ISO 3166-2:PS +Panama|PA|PAN|591|ISO 3166-2:PA +Papua New Guinea|PG|PNG|598|ISO 3166-2:PG +Paraguay|PY|PRY|600|ISO 3166-2:PY +Peru|PE|PER|604|ISO 3166-2:PE +Philippines|PH|PHL|608|ISO 3166-2:PH +Pitcairn|PN|PCN|612|ISO 3166-2:PN +Poland|PL|POL|616|ISO 3166-2:PL +Portugal|PT|PRT|620|ISO 3166-2:PT +Puerto Rico|PR|PRI|630|ISO 3166-2:PR +Qatar|QA|QAT|634|ISO 3166-2:QA +Réunion|RE|REU|638|ISO 3166-2:RE +Romania|RO|ROU|642|ISO 3166-2:RO +Russian Federation|RU|RUS|643|ISO 3166-2:RU +Rwanda|RW|RWA|646|ISO 3166-2:RW +Saint Barthélemy|BL|BLM|652|ISO 3166-2:BL +Saint Helena, Ascension and Tristan da Cunha|SH|SHN|654|ISO 3166-2:SH +Saint Kitts and Nevis|KN|KNA|659|ISO 3166-2:KN +Saint Lucia|LC|LCA|662|ISO 3166-2:LC +Saint Martin (French part|MF|MAF|663|ISO 3166-2:MF +Saint Pierre and Miquelon|PM|SPM|666|ISO 3166-2:PM +Saint Vincent and the Grenadines|VC|VCT|670|ISO 3166-2:VC +Samoa|WS|WSM|882|ISO 3166-2:WS +San Marino|SM|SMR|674|ISO 3166-2:SM +Sao Tome and Principe|ST|STP|678|ISO 3166-2:ST +Saudi Arabia|SA|SAU|682|ISO 3166-2:SA +Senegal|SN|SEN|686|ISO 3166-2:SN +Serbia|RS|SRB|688|ISO 3166-2:RS +Seychelles|SC|SYC|690|ISO 3166-2:SC +Sierra Leone|SL|SLE|694|ISO 3166-2:SL +Singapore|SG|SGP|702|ISO 3166-2:SG +Sint Maarten (Dutch part|SX|SXM|534|ISO 3166-2:SX +Slovakia|SK|SVK|703|ISO 3166-2:SK +Slovenia|SI|SVN|705|ISO 3166-2:SI +Solomon Islands|SB|SLB|090|ISO 3166-2:SB +Somalia|SO|SOM|706|ISO 3166-2:SO +South Africa|ZA|ZAF|710|ISO 3166-2:ZA +South Georgia and the South Sandwich Islands|GS|SGS|239|ISO 3166-2:GS +South Sudan|SS|SSD|728|ISO 3166-2:SS +Spain|ES|ESP|724|ISO 3166-2:ES +Sri Lanka|LK|LKA|144|ISO 3166-2:LK +Sudan|SD|SDN|729|ISO 3166-2:SD +Suriname|SR|SUR|740|ISO 3166-2:SR +Svalbard and Jan Mayen|SJ|SJM|744|ISO 3166-2:SJ +Swaziland|SZ|SWZ|748|ISO 3166-2:SZ +Sweden|SE|SWE|752|ISO 3166-2:SE +Switzerland|CH|CHE|756|ISO 3166-2:CH +Syrian Arab Republic|SY|SYR|760|ISO 3166-2:SY +Taiwan, Province of 
China|TW|TWN|158|ISO 3166-2:TW +Tajikistan|TJ|TJK|762|ISO 3166-2:TJ +Tanzania, United Republic of|TZ|TZA|834|ISO 3166-2:TZ +Thailand|TH|THA|764|ISO 3166-2:TH +Timor-Leste|TL|TLS|626|ISO 3166-2:TL +Togo|TG|TGO|768|ISO 3166-2:TG +Tokelau|TK|TKL|772|ISO 3166-2:TK +Tonga|TO|TON|776|ISO 3166-2:TO +Trinidad and Tobago|TT|TTO|780|ISO 3166-2:TT +Tunisia|TN|TUN|788|ISO 3166-2:TN +Turkey|TR|TUR|792|ISO 3166-2:TR +Turkmenistan|TM|TKM|795|ISO 3166-2:TM +Turks and Caicos Islands|TC|TCA|796|ISO 3166-2:TC +Tuvalu|TV|TUV|798|ISO 3166-2:TV +Uganda|UG|UGA|800|ISO 3166-2:UG +Ukraine|UA|UKR|804|ISO 3166-2:UA +United Arab Emirates|AE|ARE|784|ISO 3166-2:AE +United Kingdom|GB|GBR|826|ISO 3166-2:GB +United States|US|USA|840|ISO 3166-2:US +United States Minor Outlying Islands|UM|UMI|581|ISO 3166-2:UM +Uruguay|UY|URY|858|ISO 3166-2:UY +Uzbekistan|UZ|UZB|860|ISO 3166-2:UZ +Vanuatu|VU|VUT|548|ISO 3166-2:VU +Venezuela, Bolivarian Republic of|VE|VEN|862|ISO 3166-2:VE +Viet Nam|VN|VNM|704|ISO 3166-2:VN +Virgin Islands, British|VG|VGB|092|ISO 3166-2:VG +Virgin Islands, U.S|VI|VIR|850|ISO 3166-2:VI +Wallis and Futuna|WF|WLF|876|ISO 3166-2:WF +Western Sahara|EH|ESH|732|ISO 3166-2:EH +Yemen|YE|YEM|887|ISO 3166-2:YE +Zambia|ZM|ZMB|894|ISO 3166-2:ZM +Zimbabwe|ZW|ZWE|716|ISO 3166-2:ZW diff --git a/libs/guessit/__init__.py b/libs/guessit/__init__.py index a86f71b..9c7c9d0 100644 --- a/libs/guessit/__init__.py +++ b/libs/guessit/__init__.py @@ -18,7 +18,7 @@ # along with this program. If not, see . 
# -__version__ = '0.3.1' +__version__ = '0.4' __all__ = ['Guess', 'Language', 'guess_file_info', 'guess_video_info', 'guess_movie_info', 'guess_episode_info'] @@ -29,7 +29,7 @@ from guessit.language import Language from guessit.matcher import IterativeMatcher import logging -log = logging.getLogger("guessit") +log = logging.getLogger(__name__) class NullHandler(logging.Handler): diff --git a/libs/guessit/country.py b/libs/guessit/country.py new file mode 100644 index 0000000..f529728 --- /dev/null +++ b/libs/guessit/country.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2012 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +from __future__ import unicode_literals +from guessit import fileutils +import logging + +log = logging.getLogger(__name__) + + +# parsed from http://en.wikipedia.org/wiki/ISO_3166-1 +# +# Description of the fields: +# "An English name, an alpha-2 code (when given), +# an alpha-3 code (when given), a numeric code, and an ISO 3166-2 code +# are all separated by pipe (|) characters."
+_iso3166_contents = fileutils.load_file_in_same_dir(__file__, + 'ISO-3166-1_utf8.txt').decode('utf-8') + +country_matrix = [ l.strip().split('|') + for l in _iso3166_contents.strip().split('\n') ] + +country_matrix += [ [ 'Unknown', 'un', 'unk', '', '' ], + [ 'Latin America', '', 'lat', '', '' ] + ] + +country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix) +country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix)) +country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix)) + +# add here exceptions / non ISO representations +# Note: remember to put those exceptions in lower-case, they won't work otherwise +country_to_alpha3.update({ 'latinoamérica': 'lat', + 'brazilian': 'bra', + 'españa': 'esp', + 'uk': 'gbr' + }) + +country_alpha3_to_en_name = dict((c[2].lower(), c[0]) for c in country_matrix) +country_alpha3_to_alpha2 = dict((c[2].lower(), c[1].lower()) for c in country_matrix) + + + +class Country(object): + """This class represents a country. + + You can initialize it with pretty much anything, as it knows conversion + from ISO-3166 2-letter and 3-letter codes, and an English name. 
+ """ + + def __init__(self, country, strict=False): + self.alpha3 = country_to_alpha3.get(country.lower()) + + if self.alpha3 is None and strict: + msg = 'The given string "%s" could not be identified as a country' + raise ValueError(msg % country) + + if self.alpha3 is None: + self.alpha3 = 'unk' + + + @property + def alpha2(self): + return country_alpha3_to_alpha2[self.alpha3] + + @property + def english_name(self): + return country_alpha3_to_en_name[self.alpha3] + + def __hash__(self): + return hash(self.alpha3) + + def __eq__(self, other): + if isinstance(other, Country): + return self.alpha3 == other.alpha3 + + if isinstance(other, basestring): + try: + return self == Country(other) + except ValueError: + return False + + return False + + def __ne__(self, other): + return not self == other + + def __unicode__(self): + return self.english_name + + def __str__(self): + return unicode(self).encode('utf-8') + + def __repr__(self): + return 'Country(%s)' % self.english_name + diff --git a/libs/guessit/guess.py b/libs/guessit/guess.py index 9950a12..e25ca1f 100644 --- a/libs/guessit/guess.py +++ b/libs/guessit/guess.py @@ -22,7 +22,7 @@ import json import datetime import logging -log = logging.getLogger("guessit.guess") +log = logging.getLogger(__name__) class Guess(dict): diff --git a/libs/guessit/language.py b/libs/guessit/language.py index 777d0e2..b043346 100644 --- a/libs/guessit/language.py +++ b/libs/guessit/language.py @@ -18,10 +18,18 @@ # along with this program. If not, see . 
# +from __future__ import unicode_literals from guessit import fileutils +from guessit.country import Country +import re import logging -log = logging.getLogger('guessit.language') +__all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language', + 'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'search_language' ] + + +log = logging.getLogger(__name__) + # downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt # @@ -30,9 +38,23 @@ log = logging.getLogger('guessit.language') # an alpha-2 code (when given), an English name, and a French name of a language # are all separated by pipe (|) characters." _iso639_contents = fileutils.load_file_in_same_dir(__file__, - 'ISO-639-2_utf-8.txt') -language_matrix = [ l.strip().decode('utf-8').split('|') - for l in _iso639_contents.split('\n') ] + 'ISO-639-2_utf-8.txt').decode('utf-8') + +# drop the BOM from the beginning of the file +_iso639_contents = _iso639_contents[1:] + +language_matrix = [ l.strip().split('|') + for l in _iso639_contents.strip().split('\n') ] + +language_matrix += [ [ 'unk', '', 'un', 'Unknown', 'inconnu' ] ] + + +# remove unused languages that shadow other common ones with a non-official form +for lang in language_matrix: + if (lang[2] == 'se' or # Northern Sami shadows Swedish + lang[2] == 'br'): # Breton shadows Brazilian + language_matrix.remove(lang) + lng3 = frozenset(l[0] for l in language_matrix if l[0]) lng3term = frozenset(l[1] for l in language_matrix if l[1]) @@ -63,54 +85,126 @@ lng_fr_name_to_lng3 = dict((fr_name.lower(), l[0]) for l in language_matrix if l[4] for fr_name in l[4].split('; ')) +# contains a list of exceptions: strings that should be parsed as a language +# but which are not in an ISO form +lng_exceptions = { 'gr': ('gre', None), + 'greek': ('gre', None), + 'esp': ('spa', None), + 'español': ('spa', None), + 'se': ('swe', None), + 'po': ('pt', 'br'), + 'pob': ('pt', 'br'), + 'br': ('pt', 'br'), + 'brazilian': ('pt', 'br'), + 'català': ('cat', None), + 'cz': 
('cze', None), + 'ua': ('ukr', None), + 'cn': ('chi', None), + 'chs': ('chi', None), + 'jp': ('jpn', None) + } + + +def is_iso_language(language): + return language.lower() in lng_all_names def is_language(language): - return language.lower() in lng_all_names + return is_iso_language(language) or language in lng_exceptions + +def lang_set(languages, strict=False): + """Return a set of guessit.Language created from their given string + representation. + + if strict is True, then this will raise an exception if any language + could not be identified. + """ + return set(Language(l, strict=strict) for l in languages) class Language(object): """This class represents a human language. - You can initialize it with pretty much everything, as it knows conversion + You can initialize it with pretty much anything, as it knows conversion from ISO-639 2-letter and 3-letter codes, English and French names. + You can also distinguish languages for specific countries, such as + Portuguese and Brazilian Portuguese. 
+ >>> Language('fr') Language(French) - >>> Language('eng').french_name() + >>> Language('eng').french_name u'anglais' + + >>> Language('pt(br)').country.english_name + u'Brazil' + + >>> Language('Español (Latinoamérica)').country.english_name + u'Latin America' + + >>> Language('Spanish (Latin America)') == Language('Español (Latinoamérica)') + True + + >>> Language('zz', strict=False).english_name + u'Unknown' """ - def __init__(self, language): - lang = None - language = language.lower() + + _with_country_regexp = re.compile('(.*)\((.*)\)') + + def __init__(self, language, country=None, strict=False): + language = language.strip().lower() + if isinstance(language, str): + language = language.decode('utf-8') + with_country = Language._with_country_regexp.match(language) + if with_country: + self.lang = Language(with_country.group(1)).lang + self.country = Country(with_country.group(2)) + return + + self.lang = None + self.country = Country(country) if country else None + if len(language) == 2: - lang = lng2_to_lng3.get(language) + self.lang = lng2_to_lng3.get(language) elif len(language) == 3: - lang = (language - if language in lng3 - else lng3term_to_lng3.get(language)) + self.lang = (language + if language in lng3 + else lng3term_to_lng3.get(language)) else: - lang = (lng_en_name_to_lng3.get(language) or - lng_fr_name_to_lng3.get(language)) + self.lang = (lng_en_name_to_lng3.get(language) or + lng_fr_name_to_lng3.get(language)) - if lang is None: - msg = 'The given string "%s" could not be identified as a language' - raise ValueError(msg % language) + if self.lang is None and language in lng_exceptions: + lang, country = lng_exceptions[language] + self.lang = Language(lang).alpha3 + self.country = Country(country) if country else None - self.lang = lang + msg = 'The given string "%s" could not be identified as a language' % language - def lng2(self): + if self.lang is None and strict: + raise ValueError(msg) + + if self.lang is None: + log.debug(msg) + 
self.lang = 'unk' + + @property + def alpha2(self): return lng3_to_lng2[self.lang] - def lng3(self): + @property + def alpha3(self): return self.lang - def lng3term(self): + @property + def alpha3term(self): return lng3_to_lng3term[self.lang] + @property def english_name(self): return lng3_to_lng_en_name[self.lang] + @property def french_name(self): return lng3_to_lng_fr_name[self.lang] @@ -132,15 +226,27 @@ class Language(object): def __ne__(self, other): return not self == other + def __nonzero__(self): + return self.lang != 'unk' + def __unicode__(self): - return lng3_to_lng_en_name[self.lang] + if self.country: + return '%s(%s)' % (self.english_name, self.country.alpha2) + else: + return self.english_name def __str__(self): return unicode(self).encode('utf-8') def __repr__(self): - return 'Language(%s)' % self + if self.country: + return 'Language(%s, country=%s)' % (self.english_name, self.country) + else: + return 'Language(%s)' % self.english_name + +ALL_LANGUAGES = frozenset(Language(lng) for lng in lng_all_names) - frozenset([Language('unk')]) +ALL_LANGUAGES_NAMES = lng_all_names def search_language(string, lang_filter=None): """Looks for language patterns, and if found return the language object, @@ -177,7 +283,7 @@ def search_language(string, lang_filter=None): sep = r'[](){} \._-+' if lang_filter: - lang_filter = set(Language(l) for l in lang_filter) + lang_filter = lang_set(lang_filter) slow = ' %s ' % string.lower() confidence = 1.0 # for all of them diff --git a/libs/guessit/matcher.py b/libs/guessit/matcher.py index cac172d..b0a5040 100644 --- a/libs/guessit/matcher.py +++ b/libs/guessit/matcher.py @@ -25,7 +25,7 @@ from guessit.guess import (merge_similar_guesses, merge_all, import copy import logging -log = logging.getLogger("guessit.matcher") +log = logging.getLogger(__name__) class IterativeMatcher(object): @@ -105,7 +105,7 @@ class IterativeMatcher(object): 'guess_release_group', 'guess_properties', 'guess_weak_episodes_rexps', 
'guess_language'] else: - strategy = ['guess_date', 'guess_year', 'guess_video_rexps', + strategy = ['guess_date', 'guess_video_rexps', 'guess_website', 'guess_release_group', 'guess_properties', 'guess_language'] @@ -125,6 +125,7 @@ class IterativeMatcher(object): if mtree.guess['type'] in ('episode', 'episodesubtitle'): apply_transfo('guess_episode_info_from_position') else: + apply_transfo('guess_year') apply_transfo('guess_movie_title_from_position') # 6- perform some post-processing steps diff --git a/libs/guessit/matchtree.py b/libs/guessit/matchtree.py index 634cbf7..466e0bb 100644 --- a/libs/guessit/matchtree.py +++ b/libs/guessit/matchtree.py @@ -23,7 +23,7 @@ from guessit.textutils import clean_string, str_fill, to_utf8 from guessit.patterns import group_delimiters import logging -log = logging.getLogger("guessit.matchtree") +log = logging.getLogger(__name__) class BaseMatchTree(object): diff --git a/libs/guessit/patterns.py b/libs/guessit/patterns.py index 4125fb7..4223585 100755 --- a/libs/guessit/patterns.py +++ b/libs/guessit/patterns.py @@ -22,8 +22,9 @@ subtitle_exts = [ 'srt', 'idx', 'sub', 'ssa', 'txt' ] -video_exts = [ 'avi', 'mkv', 'mpg', 'mp4', 'm4v', 'mov', 'ogg', 'ogm', 'ogv', - 'wmv', 'divx' ] +video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', + 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', + 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv'] group_delimiters = [ '()', '[]', '{}' ] @@ -62,6 +63,8 @@ weak_episode_rexps = [ # ... 213 or 0106 ... # ... 2x13 ... (sep + r'[^0-9](?P[0-9]{1,2})\.(?P[0-9]{2})[^0-9]' + sep, (1, -1)), + # ... e13 ... 
for a mini-series without a season number + (r'e(?P[0-9]{1,4})[^0-9]', (0, -1)), ] non_episode_title = [ 'extras', 'rip' ] diff --git a/libs/guessit/transfo/__init__.py b/libs/guessit/transfo/__init__.py index eb72beb..1bdd09b 100644 --- a/libs/guessit/transfo/__init__.py +++ b/libs/guessit/transfo/__init__.py @@ -23,7 +23,7 @@ from guessit.patterns import canonical_form from guessit.textutils import clean_string import logging -log = logging.getLogger('guessit.transfo') +log = logging.getLogger(__name__) def found_property(node, name, confidence): diff --git a/libs/guessit/transfo/guess_bonus_features.py b/libs/guessit/transfo/guess_bonus_features.py index dcb90b3..73fc7b4 100644 --- a/libs/guessit/transfo/guess_bonus_features.py +++ b/libs/guessit/transfo/guess_bonus_features.py @@ -21,7 +21,7 @@ from guessit.transfo import found_property import logging -log = logging.getLogger("guessit.transfo.guess_bonus_features") +log = logging.getLogger(__name__) def process(mtree): diff --git a/libs/guessit/transfo/guess_date.py b/libs/guessit/transfo/guess_date.py index c72d66a..ded8094 100644 --- a/libs/guessit/transfo/guess_date.py +++ b/libs/guessit/transfo/guess_date.py @@ -22,7 +22,7 @@ from guessit.transfo import SingleNodeGuesser from guessit.date import search_date import logging -log = logging.getLogger("guessit.transfo.guess_date") +log = logging.getLogger(__name__) def guess_date(string): diff --git a/libs/guessit/transfo/guess_episode_info_from_position.py b/libs/guessit/transfo/guess_episode_info_from_position.py index fe1a752..7b4f43f 100644 --- a/libs/guessit/transfo/guess_episode_info_from_position.py +++ b/libs/guessit/transfo/guess_episode_info_from_position.py @@ -22,7 +22,7 @@ from guessit.transfo import found_property from guessit.patterns import non_episode_title, unlikely_series import logging -log = logging.getLogger("guessit.transfo.guess_episode_info_from_position") +log = logging.getLogger(__name__) def match_from_epnum_position(mtree, node): @@ 
-112,6 +112,9 @@ def process(mtree): if len(title_candidates) >= 2: found_property(title_candidates[0], 'series', 0.4) found_property(title_candidates[1], 'title', 0.4) + elif len(title_candidates) == 1: + # but if there's only one candidate, it's probably the series name + found_property(title_candidates[0], 'series', 0.4) # if we only have 1 remaining valid group in the folder containing the # file, then it's likely that it is the series name diff --git a/libs/guessit/transfo/guess_episodes_rexps.py b/libs/guessit/transfo/guess_episodes_rexps.py index 46dbc59..dfaa944 100644 --- a/libs/guessit/transfo/guess_episodes_rexps.py +++ b/libs/guessit/transfo/guess_episodes_rexps.py @@ -24,7 +24,7 @@ from guessit.patterns import episode_rexps import re import logging -log = logging.getLogger("guessit.transfo.guess_episodes_rexps") +log = logging.getLogger(__name__) def guess_episodes_rexps(string): diff --git a/libs/guessit/transfo/guess_filetype.py b/libs/guessit/transfo/guess_filetype.py index 32bdc13..bf0a80a 100644 --- a/libs/guessit/transfo/guess_filetype.py +++ b/libs/guessit/transfo/guess_filetype.py @@ -26,7 +26,7 @@ import re import mimetypes import logging -log = logging.getLogger("guessit.transfo.guess_filetype") +log = logging.getLogger(__name__) def guess_filetype(filename, filetype): diff --git a/libs/guessit/transfo/guess_language.py b/libs/guessit/transfo/guess_language.py index 62f47d8..aa1431b 100644 --- a/libs/guessit/transfo/guess_language.py +++ b/libs/guessit/transfo/guess_language.py @@ -24,7 +24,7 @@ from guessit.language import search_language from guessit.textutils import clean_string import logging -log = logging.getLogger("guessit.transfo.guess_language") +log = logging.getLogger(__name__) def guess_language(string): diff --git a/libs/guessit/transfo/guess_movie_title_from_position.py b/libs/guessit/transfo/guess_movie_title_from_position.py index dea56d6..55289c8 100644 --- a/libs/guessit/transfo/guess_movie_title_from_position.py +++ 
b/libs/guessit/transfo/guess_movie_title_from_position.py @@ -21,7 +21,7 @@ from guessit import Guess import logging -log = logging.getLogger("guessit.transfo.guess_movie_title_from_position") +log = logging.getLogger(__name__) def process(mtree): diff --git a/libs/guessit/transfo/guess_properties.py b/libs/guessit/transfo/guess_properties.py index 3822d22..02d0cad 100644 --- a/libs/guessit/transfo/guess_properties.py +++ b/libs/guessit/transfo/guess_properties.py @@ -22,7 +22,7 @@ from guessit.transfo import SingleNodeGuesser from guessit.patterns import find_properties import logging -log = logging.getLogger("guessit.transfo.guess_properties") +log = logging.getLogger(__name__) def guess_properties(string): diff --git a/libs/guessit/transfo/guess_release_group.py b/libs/guessit/transfo/guess_release_group.py index 9ec609d..54a7148 100644 --- a/libs/guessit/transfo/guess_release_group.py +++ b/libs/guessit/transfo/guess_release_group.py @@ -22,7 +22,7 @@ from guessit.transfo import SingleNodeGuesser import re import logging -log = logging.getLogger("guessit.transfo.guess_release_group") +log = logging.getLogger(__name__) def guess_release_group(string): diff --git a/libs/guessit/transfo/guess_video_rexps.py b/libs/guessit/transfo/guess_video_rexps.py index 36723c8..697a6af 100644 --- a/libs/guessit/transfo/guess_video_rexps.py +++ b/libs/guessit/transfo/guess_video_rexps.py @@ -24,7 +24,7 @@ from guessit.patterns import video_rexps, sep import re import logging -log = logging.getLogger("guessit.transfo.guess_video_rexps") +log = logging.getLogger(__name__) def guess_video_rexps(string): diff --git a/libs/guessit/transfo/guess_weak_episodes_rexps.py b/libs/guessit/transfo/guess_weak_episodes_rexps.py index 8fffe17..57c9f44 100644 --- a/libs/guessit/transfo/guess_weak_episodes_rexps.py +++ b/libs/guessit/transfo/guess_weak_episodes_rexps.py @@ -24,7 +24,7 @@ from guessit.patterns import weak_episode_rexps import re import logging -log = 
logging.getLogger("guessit.transfo.guess_weak_episodes_rexps") +log = logging.getLogger(__name__) def guess_weak_episodes_rexps(string, node): diff --git a/libs/guessit/transfo/guess_website.py b/libs/guessit/transfo/guess_website.py index a169f97..638f7d2 100644 --- a/libs/guessit/transfo/guess_website.py +++ b/libs/guessit/transfo/guess_website.py @@ -22,7 +22,7 @@ from guessit.transfo import SingleNodeGuesser from guessit.patterns import websites import logging -log = logging.getLogger("guessit.transfo.guess_website") +log = logging.getLogger(__name__) def guess_website(string): diff --git a/libs/guessit/transfo/guess_year.py b/libs/guessit/transfo/guess_year.py index 7a47ecf..7a90111 100644 --- a/libs/guessit/transfo/guess_year.py +++ b/libs/guessit/transfo/guess_year.py @@ -22,7 +22,7 @@ from guessit.transfo import SingleNodeGuesser from guessit.date import search_year import logging -log = logging.getLogger("guessit.transfo.guess_year") +log = logging.getLogger(__name__) def guess_year(string): diff --git a/libs/guessit/transfo/post_process.py b/libs/guessit/transfo/post_process.py index 0b5a4df..f08bbb2 100644 --- a/libs/guessit/transfo/post_process.py +++ b/libs/guessit/transfo/post_process.py @@ -21,7 +21,7 @@ from guessit.patterns import subtitle_exts import logging -log = logging.getLogger("guessit.transfo.post_process") +log = logging.getLogger(__name__) def process(mtree): diff --git a/libs/guessit/transfo/split_explicit_groups.py b/libs/guessit/transfo/split_explicit_groups.py index 797a886..f99ff19 100644 --- a/libs/guessit/transfo/split_explicit_groups.py +++ b/libs/guessit/transfo/split_explicit_groups.py @@ -22,7 +22,7 @@ from guessit.textutils import find_first_level_groups from guessit.patterns import group_delimiters import logging -log = logging.getLogger("guessit.transfo.split_explicit_groups") +log = logging.getLogger(__name__) def process(mtree): diff --git a/libs/guessit/transfo/split_on_dash.py b/libs/guessit/transfo/split_on_dash.py 
index fc10c49..0f2c34b 100644 --- a/libs/guessit/transfo/split_on_dash.py +++ b/libs/guessit/transfo/split_on_dash.py @@ -22,7 +22,7 @@ from guessit.patterns import sep import re import logging -log = logging.getLogger("guessit.transfo.split_on_dash") +log = logging.getLogger(__name__) def process(mtree): diff --git a/libs/guessit/transfo/split_path_components.py b/libs/guessit/transfo/split_path_components.py index 0f8d1a5..9f7ec9b 100644 --- a/libs/guessit/transfo/split_path_components.py +++ b/libs/guessit/transfo/split_path_components.py @@ -22,7 +22,7 @@ from guessit import fileutils import os.path import logging -log = logging.getLogger("guessit.transfo.split_path_components") +log = logging.getLogger(__name__) def process(mtree):