16 changed files with 1 additions and 1460 deletions
@ -1,78 +0,0 @@ |
|||||
#!/usr/bin/env python |
|
||||
# encoding: utf-8 |
|
||||
""" |
|
||||
StringMatcher.py |
|
||||
|
|
||||
ported from python-Levenshtein |
|
||||
[https://github.com/miohtama/python-Levenshtein] |
|
||||
""" |
|
||||
|
|
||||
from Levenshtein import * |
|
||||
from warnings import warn |
|
||||
|
|
||||
class StringMatcher:
    """A SequenceMatcher-like class built on top of Levenshtein.

    The two sequences are stored on the instance and every derived value
    (ratio, distance, opcodes, editops, matching blocks) is computed lazily
    through the Levenshtein functions and memoised until a sequence changes.
    """

    def _reset_cache(self):
        """Drop every memoised result; called whenever a sequence changes."""
        self._ratio = self._distance = None
        self._opcodes = self._editops = self._matching_blocks = None

    def __init__(self, isjunk=None, seq1='', seq2=''):
        """Store both sequences.

        ``isjunk`` exists only for signature compatibility with difflib;
        a truthy value triggers a warning and is otherwise ignored.
        """
        if isjunk:
            warn("isjunk is NOT implemented, it will be ignored")
        self._str1, self._str2 = seq1, seq2
        self._reset_cache()

    def set_seqs(self, seq1, seq2):
        """Replace both sequences and invalidate the cache."""
        self._str1, self._str2 = seq1, seq2
        self._reset_cache()

    def set_seq1(self, seq1):
        """Replace the first sequence and invalidate the cache."""
        self._str1 = seq1
        self._reset_cache()

    def set_seq2(self, seq2):
        """Replace the second sequence and invalidate the cache."""
        self._str2 = seq2
        self._reset_cache()

    def get_opcodes(self):
        """Return the (cached) opcodes, derived from editops when available."""
        # ``is None`` rather than truthiness: an empty result is a valid
        # cached value and must not be recomputed on every call.
        if self._opcodes is None:
            if self._editops:
                self._opcodes = opcodes(self._editops, self._str1, self._str2)
            else:
                self._opcodes = opcodes(self._str1, self._str2)
        return self._opcodes

    def get_editops(self):
        """Return the (cached) editops, derived from opcodes when available."""
        if self._editops is None:
            if self._opcodes:
                self._editops = editops(self._opcodes, self._str1, self._str2)
            else:
                self._editops = editops(self._str1, self._str2)
        return self._editops

    def get_matching_blocks(self):
        """Return the (cached) matching blocks computed from the opcodes."""
        if self._matching_blocks is None:
            self._matching_blocks = matching_blocks(self.get_opcodes(),
                                                    self._str1, self._str2)
        return self._matching_blocks

    def ratio(self):
        """Return the (cached) Levenshtein similarity ratio."""
        # A ratio of 0.0 is falsy, so the cache must be tested against None.
        if self._ratio is None:
            self._ratio = ratio(self._str1, self._str2)
        return self._ratio

    def quick_ratio(self):
        """Same value as ratio(); it shares the same cache slot."""
        # This is usually quick enough :o)
        if self._ratio is None:
            self._ratio = ratio(self._str1, self._str2)
        return self._ratio

    def real_quick_ratio(self):
        """Return an upper bound on the ratio based on the lengths only."""
        len1, len2 = len(self._str1), len(self._str2)
        total = len1 + len2
        if not total:
            # Two empty sequences: avoid dividing by zero and report a
            # perfect match, as difflib.SequenceMatcher does.
            return 1.0
        return 2.0 * min(len1, len2) / total

    def distance(self):
        """Return the (cached) Levenshtein edit distance."""
        # A distance of 0 is falsy, so the cache must be tested against None.
        if self._distance is None:
            self._distance = distance(self._str1, self._str2)
        return self._distance
|
@ -1,263 +0,0 @@ |
|||||
#!/usr/bin/env python |
|
||||
# encoding: utf-8 |
|
||||
""" |
|
||||
fuzz.py |
|
||||
|
|
||||
Copyright (c) 2011 Adam Cohen |
|
||||
|
|
||||
Permission is hereby granted, free of charge, to any person obtaining |
|
||||
a copy of this software and associated documentation files (the |
|
||||
"Software"), to deal in the Software without restriction, including |
|
||||
without limitation the rights to use, copy, modify, merge, publish, |
|
||||
distribute, sublicense, and/or sell copies of the Software, and to |
|
||||
permit persons to whom the Software is furnished to do so, subject to |
|
||||
the following conditions: |
|
||||
|
|
||||
The above copyright notice and this permission notice shall be |
|
||||
included in all copies or substantial portions of the Software. |
|
||||
|
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION |
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION |
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|
||||
""" |
|
||||
from __future__ import unicode_literals |
|
||||
|
|
||||
try:
    from StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
    # The Levenshtein-backed matcher is optional; fall back to the standard
    # library. Only import failures are handled so that unrelated errors
    # are not silently swallowed.
    from difflib import SequenceMatcher
|
||||
|
|
||||
from . import utils |
|
||||
|
|
||||
|
|
||||
########################### |
|
||||
# Basic Scoring Functions # |
|
||||
########################### |
|
||||
|
|
||||
|
|
||||
def ratio(s1, s2):
    """Score the similarity of two strings as a rounded integer percentage.

    Raises TypeError when either argument is None and returns 0 when either
    string is empty.
    """
    for label, value in (("s1", s1), ("s2", s2)):
        if value is None:
            raise TypeError(label + " is None")

    s1, s2 = utils.make_type_consistent(s1, s2)
    if not (len(s1) and len(s2)):
        return 0

    matcher = SequenceMatcher(None, s1, s2)
    percentage = 100 * matcher.ratio()
    return utils.intr(percentage)
|
||||
|
|
||||
|
|
||||
# todo: skip duplicate indexes for a little more speed |
|
||||
def partial_ratio(s1, s2):
    """Score the best alignment of the shorter string inside the longer one.

    Returns an integer percentage, rounded with ``utils.intr`` exactly like
    ``ratio`` (the previous ``int()`` truncated, so the two scorers could
    disagree by one point on the same match quality).

    Raises TypeError when either argument is None and returns 0 when either
    string is empty.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    if len(s1) == 0 or len(s2) == 0:
        return 0

    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1

    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        # Start of the window in ``longer`` that lines this block up with
        # ``shorter``; clamped so it never starts before index 0.
        long_start = max(block[1] - block[0], 0)
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]

        m2 = SequenceMatcher(None, shorter, long_substr)
        r = m2.ratio()
        if r > .995:
            # Close enough to perfect: no other window can do better.
            return 100
        scores.append(r)

    return utils.intr(100 * max(scores))
|
||||
|
|
||||
|
|
||||
############################## |
|
||||
# Advanced Scoring Functions # |
|
||||
############################## |
|
||||
|
|
||||
# Sorted Token |
|
||||
# find all alphanumeric tokens in the string |
|
||||
# sort those tokens and take ratio of resulting joined strings |
|
||||
# controls for unordered string elements |
|
||||
def _token_sort(s1, s2, partial=True, force_ascii=True):
    """Compare two strings after sorting their whitespace-separated tokens.

    Each input goes through ``utils.full_process``, is split into tokens,
    and the sorted tokens are re-joined with single spaces. The two results
    are scored with ``partial_ratio`` when ``partial`` is true, otherwise
    with ``ratio``. Raises TypeError when either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    def _sorted_tokens(raw):
        # pull tokens, sort them and join
        words = utils.full_process(raw, force_ascii=force_ascii).split()
        return " ".join(sorted(words)).strip()

    sorted1 = _sorted_tokens(s1)
    sorted2 = _sorted_tokens(s2)

    scorer = partial_ratio if partial else ratio
    return scorer(sorted1, sorted2)
|
||||
|
|
||||
|
|
||||
def token_sort_ratio(s1, s2, force_ascii=True):
    """Token-sorted comparison scored with the full (non-partial) ratio."""
    return _token_sort(s1, s2, force_ascii=force_ascii, partial=False)
|
||||
|
|
||||
|
|
||||
def partial_token_sort_ratio(s1, s2, force_ascii=True):
    """Token-sorted comparison scored with the partial ratio."""
    return _token_sort(s1, s2, force_ascii=force_ascii, partial=True)
|
||||
|
|
||||
|
|
||||
# Token Set |
|
||||
# find all alphanumeric tokens in each string...treat them as a set |
|
||||
# construct two strings of the form |
|
||||
# <sorted_intersection><sorted_remainder> |
|
||||
# take ratios of those two strings |
|
||||
# controls for unordered partial matches |
|
||||
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Compare two strings as sets of tokens.

    Builds three strings -- the sorted intersection, and the intersection
    followed by each side's sorted remainder -- and returns the best
    pairwise score among them.

    ``partial`` selects the scorer: ``partial_ratio`` when true, ``ratio``
    otherwise. (The flag used to be ignored, which made the partial variant
    identical to the full one.)

    Raises TypeError when either argument is None; returns 0 when either
    string is empty after processing.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # pull tokens (the processed strings are run through full_process once
    # more, exactly as before, so tokenisation is unchanged)
    tokens1 = set(utils.full_process(p1).split())
    tokens2 = set(utils.full_process(p2).split())

    intersection = tokens1.intersection(tokens2)
    diff1to2 = tokens1.difference(tokens2)
    diff2to1 = tokens2.difference(tokens1)

    sorted_sect = " ".join(sorted(intersection))
    sorted_1to2 = " ".join(sorted(diff1to2))
    sorted_2to1 = " ".join(sorted(diff2to1))

    combined_1to2 = sorted_sect + " " + sorted_1to2
    combined_2to1 = sorted_sect + " " + sorted_2to1

    # strip the separator left over when one of the parts is empty
    sorted_sect = sorted_sect.strip()
    combined_1to2 = combined_1to2.strip()
    combined_2to1 = combined_2to1.strip()

    ratio_func = partial_ratio if partial else ratio
    pairwise = [
        ratio_func(sorted_sect, combined_1to2),
        ratio_func(sorted_sect, combined_2to1),
        ratio_func(combined_1to2, combined_2to1),
    ]
    return max(pairwise)
|
||||
|
|
||||
|
|
||||
def token_set_ratio(s1, s2, force_ascii=True):
    """Token-set comparison requested with ``partial=False``."""
    return _token_set(s1, s2, force_ascii=force_ascii, partial=False)
|
||||
|
|
||||
|
|
||||
def partial_token_set_ratio(s1, s2, force_ascii=True):
    """Token-set comparison requested with ``partial=True``."""
    return _token_set(s1, s2, force_ascii=force_ascii, partial=True)
|
||||
|
|
||||
|
|
||||
# TODO: numerics |
|
||||
|
|
||||
################### |
|
||||
# Combination API # |
|
||||
################### |
|
||||
|
|
||||
# q is for quick |
|
||||
def QRatio(s1, s2, force_ascii=True):
    """Quick ratio: process both strings, then score them with ``ratio``.

    Returns 0 when either string is empty after processing.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if utils.validate_string(p1) and utils.validate_string(p2):
        return ratio(p1, p2)
    return 0
|
||||
|
|
||||
|
|
||||
def UQRatio(s1, s2):
    """``QRatio`` without forcing the inputs to ASCII."""
    return QRatio(s1, s2, False)
|
||||
|
|
||||
|
|
||||
# w is for weighted |
|
||||
def WRatio(s1, s2, force_ascii=True):
    """Weighted ratio: the best of several scorers, each scaled down.

    The plain ``ratio`` is the baseline. When the strings have similar
    lengths (length ratio below 1.5) the token-sort and token-set scores
    compete with it, scaled by 0.95. Otherwise the partial scorers are used
    instead, additionally scaled by 0.90, or by 0.6 when one string is more
    than 8 times longer than the other. Returns 0 when either string is
    empty after processing.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    unbase_scale = .95
    base = ratio(p1, p2)

    lengths = (len(p1), len(p2))
    len_ratio = float(max(lengths)) / min(lengths)

    # if strings are similar length, don't use partials
    if len_ratio < 1.5:
        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        return int(max(base, tsor, tser))

    # if one string is much much shorter than the other, trust partials less
    partial_scale = .6 if len_ratio > 8 else .90

    partial = partial_ratio(p1, p2) * partial_scale
    ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale
    ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale

    return int(max(base, partial, ptsor, ptser))
|
||||
|
|
||||
|
|
||||
def UWRatio(s1, s2):
    """``WRatio`` without forcing the inputs to ASCII."""
    return WRatio(s1, s2, False)
|
@ -1,119 +0,0 @@ |
|||||
#!/usr/bin/env python |
|
||||
# encoding: utf-8 |
|
||||
""" |
|
||||
process.py |
|
||||
|
|
||||
Copyright (c) 2011 Adam Cohen |
|
||||
|
|
||||
Permission is hereby granted, free of charge, to any person obtaining |
|
||||
a copy of this software and associated documentation files (the |
|
||||
"Software"), to deal in the Software without restriction, including |
|
||||
without limitation the rights to use, copy, modify, merge, publish, |
|
||||
distribute, sublicense, and/or sell copies of the Software, and to |
|
||||
permit persons to whom the Software is furnished to do so, subject to |
|
||||
the following conditions: |
|
||||
|
|
||||
The above copyright notice and this permission notice shall be |
|
||||
included in all copies or substantial portions of the Software. |
|
||||
|
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION |
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION |
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|
||||
""" |
|
||||
import itertools |
|
||||
|
|
||||
from . import fuzz |
|
||||
from . import utils |
|
||||
|
|
||||
|
|
||||
def extract(query, choices, processor=None, scorer=None, limit=5):
    """Return the best-scoring choices as a list of (choice, score) tuples.

    Arguments:
        query     -- the thing we are looking for
        choices   -- a sized collection of candidate objects
        processor -- f(choice) --> value handed to the scorer; defaults to
                     utils.full_process()
        scorer    -- f(query, processed_choice) --> score; defaults to
                     fuzz.WRatio
        limit     -- maximum number of results returned

    Results are ordered from highest to lowest score; choices with equal
    scores keep their original relative order. ``None`` or empty choices
    yield an empty list.
    """
    if choices is None or len(choices) == 0:
        return []

    # default, turn whatever the choice is into a workable string
    if processor is None:
        processor = lambda x: utils.full_process(x)

    # default: wratio
    if scorer is None:
        scorer = fuzz.WRatio

    scored = [(choice, scorer(query, processor(choice))) for choice in choices]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]
|
||||
|
|
||||
|
|
||||
def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5):
    """Return the best matches whose score is strictly above a cutoff.

    Convenience wrapper around extract(); see it for the shared arguments.

    Optional parameter: score_cutoff.
    A choice with a score of less than or equal to score_cutoff is left
    out of the result list.
    """
    ranked = extract(query, choices, processor, scorer, limit)

    kept = []
    for candidate in ranked:
        # Stop at the first candidate that does not beat the cutoff.
        if not candidate[1] > score_cutoff:
            break
        kept.append(candidate)
    return kept
|
||||
|
|
||||
|
|
||||
def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
    """Return the single best (choice, score) tuple, or None.

    Convenience wrapper around extract(); see it for the shared arguments.

    Optional parameter: score_cutoff.
    If the best choice has a score of less than or equal to score_cutoff
    we return None (intuition: not a good enough match). None is also
    returned when there is nothing to choose from.
    """
    ranked = extract(query, choices, processor, scorer, limit=1)
    if not ranked:
        return None

    top = ranked[0]
    return top if top[1] > score_cutoff else None
|
@ -1,41 +0,0 @@ |
|||||
from __future__ import unicode_literals |
|
||||
import re |
|
||||
|
|
||||
|
|
||||
class StringProcessor(object):
    """
    This class defines methods to process strings in the most
    efficient way. Ideally all the methods below use unicode strings
    for both input and output.
    """

    # Compiled once when the class is created instead of on every call.
    _non_word_regex = re.compile(r"(?ui)\W")

    @classmethod
    def replace_non_letters_non_numbers_with_whitespace(cls, a_string):
        """
        Replace every non-word character (anything ``\\W`` matches) with a
        single white space. Characters are replaced one by one, so a run of
        N such characters becomes N spaces; underscores count as word
        characters and are kept.
        """
        return cls._non_word_regex.sub(" ", a_string)

    @classmethod
    def strip(cls, a_string):
        """
        This function strips leading and trailing white space.
        """
        return a_string.strip()

    @classmethod
    def to_lower_case(cls, a_string):
        """
        This function returns the lower-cased version of the string given.
        """
        return a_string.lower()

    @classmethod
    def to_upper_case(cls, a_string):
        """
        This function returns the upper-cased version of the string given.
        """
        return a_string.upper()
|
@ -1,76 +0,0 @@ |
|||||
from __future__ import unicode_literals |
|
||||
import sys |
|
||||
|
|
||||
from fuzzywuzzy.string_processing import StringProcessor |
|
||||
|
|
||||
|
|
||||
PY3 = sys.version_info[0] == 3 |
|
||||
|
|
||||
|
|
||||
def validate_string(s):
    """Return True when ``s`` has a length greater than zero.

    Objects without a length (``None``, numbers, ...) are reported as
    invalid instead of raising.
    """
    try:
        return len(s) > 0
    except TypeError:
        # len() is not supported by this object: not a usable string.
        return False
|
||||
|
|
||||
# ascii dammit! Every character with a code point in 128..255, i.e. the
# characters stripped when a string is forced down to ASCII.
bad_chars = str('').join(chr(code) for code in range(128, 256))

if PY3:
    # Mapping a code point to None makes str.translate() delete it.
    translation_table = dict.fromkeys(map(ord, bad_chars))
|
||||
|
|
||||
|
|
||||
def asciionly(s):
    """Return ``s`` with every character listed in ``bad_chars`` removed."""
    if not PY3:
        # Python 2 str.translate: no mapping table, just characters to delete.
        return s.translate(None, bad_chars)
    return s.translate(translation_table)
|
||||
|
|
||||
|
|
||||
def asciidammit(s):
    """Coerce ``s`` to text and strip every non-ASCII character from it.

    The ``unicode`` builtin only exists on Python 2; it is referenced only
    on that branch so that non-``str`` input no longer raises NameError on
    Python 3.
    """
    if type(s) is str:
        return asciionly(s)
    if PY3:
        if isinstance(s, bytes):
            # Decode rather than str(), which would yield "b'...'".
            return asciionly(s.decode('ascii', 'ignore'))
        return asciionly(str(s))
    if type(s) is unicode:
        return asciionly(s.encode('ascii', 'ignore'))
    return asciidammit(unicode(s))
|
||||
|
|
||||
|
|
||||
def make_type_consistent(s1, s2):
    """Return both arguments as strings of the same type.

    Two ``str`` objects, or two objects of the text type, are returned
    unchanged; any other combination is converted to the text type
    (``unicode`` on Python 2, ``str`` on Python 3).
    """
    try:
        text_type = unicode
    except NameError:
        # Python 3 has no ``unicode`` builtin; ``str`` is the text type.
        text_type = str

    if isinstance(s1, str) and isinstance(s2, str):
        return s1, s2
    elif isinstance(s1, text_type) and isinstance(s2, text_type):
        return s1, s2
    else:
        return text_type(s1), text_type(s2)
|
||||
|
|
||||
|
|
||||
def full_process(s, force_ascii=False):
    """Normalise a string for comparison.

    Steps, in order:
        -- optionally force the input to ASCII (force_ascii == True)
        -- replace every non-word character with a space
        -- force to lower case
        -- trim leading and trailing whitespace

    ``None`` is turned into the empty string.
    """
    if s is None:
        return ""

    text = asciidammit(s) if force_ascii else s
    # Keep only letters and numbers (see Unicode docs).
    text = StringProcessor.replace_non_letters_non_numbers_with_whitespace(text)
    # Force into lowercase, then drop the surrounding whitespace.
    text = StringProcessor.to_lower_case(text)
    return StringProcessor.strip(text)
|
||||
|
|
||||
|
|
||||
def intr(n):
    '''Round ``n`` with the builtin round() and return it as an int.'''
    rounded = round(n)
    return int(rounded)
|
@ -1,18 +0,0 @@ |
|||||
from pysrt.srttime import SubRipTime |
|
||||
from pysrt.srtitem import SubRipItem |
|
||||
from pysrt.srtfile import SubRipFile |
|
||||
from pysrt.srtexc import Error, InvalidItem, InvalidTimeString |
|
||||
from pysrt.version import VERSION, VERSION_STRING |
|
||||
|
|
||||
# Names exported by ``from pysrt import *``. Every entry must be bound in
# this module: the previous list named SUPPORT_UTF_32_LE / SUPPORT_UTF_32_BE,
# which are not defined here (so the star-import failed), and listed
# SubRipFile twice.
__all__ = [
    'SubRipFile', 'SubRipItem', 'SubRipTime', 'Error', 'InvalidItem',
    'InvalidTimeString',
]

# Package-level shortcuts for the error-handling constants of SubRipFile.
ERROR_PASS = SubRipFile.ERROR_PASS
ERROR_LOG = SubRipFile.ERROR_LOG
ERROR_RAISE = SubRipFile.ERROR_RAISE

# Package-level shortcuts for the SubRipFile constructors.
open = SubRipFile.open
stream = SubRipFile.stream
from_string = SubRipFile.from_string
|
@ -1,218 +0,0 @@ |
|||||
#!/usr/bin/env python |
|
||||
# -*- coding: utf-8 -*- |
|
||||
# pylint: disable-all |
|
||||
|
|
||||
import os |
|
||||
import re |
|
||||
import sys |
|
||||
import codecs |
|
||||
import shutil |
|
||||
import argparse |
|
||||
from textwrap import dedent |
|
||||
|
|
||||
from chardet import detect |
|
||||
from pysrt import SubRipFile, SubRipTime, VERSION_STRING |
|
||||
|
|
||||
def underline(string):
    """Wrap ``string`` in the ANSI escape codes that switch underline on/off."""
    template = "\033[4m%s\033[0m"
    return template % string
|
||||
|
|
||||
|
|
||||
class TimeAwareArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that protects time arguments from option parsing.

    A ``--`` separator is inserted in front of the first argument that looks
    like a time representation (e.g. ``-1s500ms``) -- presumably so that
    negative offsets are not mistaken for option flags.
    """

    RE_TIME_REPRESENTATION = re.compile(r'^\-?(\d+[hms]{0,2}){1,4}$')

    def parse_args(self, args=None, namespace=None):
        """Parse ``args`` (``sys.argv[1:]`` when None) and return the namespace.

        The argument list is copied first, so the caller's list is never
        modified by the inserted ``--``.
        """
        args = sys.argv[1:] if args is None else list(args)

        for index, arg in enumerate(args):
            if self.RE_TIME_REPRESENTATION.match(arg):
                # Only the first time-like argument gets the separator.
                args.insert(index, '--')
                break

        return super(TimeAwareArgumentParser, self).parse_args(args, namespace)
|
||||
|
|
||||
|
|
||||
class SubRipShifter(object): |
|
||||
|
|
||||
BACKUP_EXTENSION = '.bak' |
|
||||
RE_TIME_STRING = re.compile(r'(\d+)([hms]{0,2})') |
|
||||
UNIT_RATIOS = { |
|
||||
'ms': 1, |
|
||||
'': SubRipTime.SECONDS_RATIO, |
|
||||
's': SubRipTime.SECONDS_RATIO, |
|
||||
'm': SubRipTime.MINUTES_RATIO, |
|
||||
'h': SubRipTime.HOURS_RATIO, |
|
||||
} |
|
||||
DESCRIPTION = dedent("""\ |
|
||||
Srt subtitle editor |
|
||||
|
|
||||
It can either shift, split or change the frame rate. |
|
||||
""") |
|
||||
TIMESTAMP_HELP = "A timestamp in the form: [-][Hh][Mm]S[s][MSms]" |
|
||||
SHIFT_EPILOG = dedent("""\ |
|
||||
|
|
||||
Examples: |
|
||||
1 minute and 12 seconds foreward (in place): |
|
||||
$ srt -i shift 1m12s movie.srt |
|
||||
|
|
||||
half a second foreward: |
|
||||
$ srt shift 500ms movie.srt > othername.srt |
|
||||
|
|
||||
1 second and half backward: |
|
||||
$ srt -i shift -1s500ms movie.srt |
|
||||
|
|
||||
3 seconds backward: |
|
||||
$ srt -i shift -3 movie.srt |
|
||||
""") |
|
||||
RATE_EPILOG = dedent("""\ |
|
||||
|
|
||||
Examples: |
|
||||
Convert 23.9fps subtitles to 25fps: |
|
||||
$ srt -i rate 23.9 25 movie.srt |
|
||||
""") |
|
||||
LIMITS_HELP = "Each parts duration in the form: [Hh][Mm]S[s][MSms]" |
|
||||
SPLIT_EPILOG = dedent("""\ |
|
||||
|
|
||||
Examples: |
|
||||
For a movie in 2 parts with the first part 48 minutes and 18 seconds long: |
|
||||
$ srt split 48m18s movie.srt |
|
||||
=> creates movie.1.srt and movie.2.srt |
|
||||
|
|
||||
For a movie in 3 parts of 20 minutes each: |
|
||||
$ srt split 20m 20m movie.srt |
|
||||
=> creates movie.1.srt, movie.2.srt and movie.3.srt |
|
||||
""") |
|
||||
FRAME_RATE_HELP = "A frame rate in fps (commonly 23.9 or 25)" |
|
||||
ENCODING_HELP = dedent("""\ |
|
||||
Change file encoding. Useful for players accepting only latin1 subtitles. |
|
||||
List of supported encodings: http://docs.python.org/library/codecs.html#standard-encodings |
|
||||
""") |
|
||||
BREAK_EPILOG = dedent("""\ |
|
||||
Break lines longer than defined length |
|
||||
""") |
|
||||
LENGTH_HELP = "Maximum number of characters per line" |
|
||||
|
|
||||
def __init__(self):
    """Start with no output path; create_backup() sets one for in-place edits."""
    self.output_file_path = None
|
||||
|
|
||||
def build_parser(self):
    """Build the command-line parser with its four sub-commands.

    Global options: -i/--in-place, -e/--output-encoding and -v/--version.
    Sub-commands: shift, rate, split and break, each bound to the matching
    method through the ``action`` default. The positional ``file`` argument
    is declared last.
    """
    raw_formatter = argparse.RawTextHelpFormatter

    parser = TimeAwareArgumentParser(description=self.DESCRIPTION, formatter_class=raw_formatter)
    parser.add_argument(
        '-i', '--in-place', action='store_true', dest='in_place',
        help="Edit file in-place, saving a backup as file.bak (do not works for the split command)")
    parser.add_argument(
        '-e', '--output-encoding', metavar=underline('encoding'), action='store',
        dest='output_encoding', type=self.parse_encoding, help=self.ENCODING_HELP)
    parser.add_argument('-v', '--version', action='version', version='%%(prog)s %s' % VERSION_STRING)
    subparsers = parser.add_subparsers(title='commands')

    def add_command(name, summary, epilog):
        # Every sub-command shares the raw-text formatter.
        return subparsers.add_parser(name, help=summary, epilog=epilog, formatter_class=raw_formatter)

    shift_parser = add_command('shift', "Shift subtitles by specified time offset", self.SHIFT_EPILOG)
    shift_parser.add_argument(
        'time_offset', action='store', metavar=underline('offset'),
        type=self.parse_time, help=self.TIMESTAMP_HELP)
    shift_parser.set_defaults(action=self.shift)

    rate_parser = add_command('rate', "Convert subtitles from a frame rate to another", self.RATE_EPILOG)
    rate_parser.add_argument('initial', action='store', type=float, help=self.FRAME_RATE_HELP)
    rate_parser.add_argument('final', action='store', type=float, help=self.FRAME_RATE_HELP)
    rate_parser.set_defaults(action=self.rate)

    split_parser = add_command('split', "Split a file in multiple parts", self.SPLIT_EPILOG)
    split_parser.add_argument('limits', action='store', nargs='+', type=self.parse_time, help=self.LIMITS_HELP)
    split_parser.set_defaults(action=self.split)

    break_parser = add_command('break', "Break long lines", self.BREAK_EPILOG)
    break_parser.add_argument('length', action='store', type=int, help=self.LENGTH_HELP)
    break_parser.set_defaults(action=self.break_lines)

    parser.add_argument('file', action='store')

    return parser
|
||||
|
|
||||
def run(self, args):
    """Parse ``args``, back the file up when editing in place, then run the command."""
    parser = self.build_parser()
    self.arguments = parser.parse_args(args)
    if self.arguments.in_place:
        self.create_backup()
    self.arguments.action()
|
||||
|
|
||||
def parse_time(self, time_string):
    """Convert a time string such as ``-1m12s`` into a signed total.

    Every ``<digits><unit>`` pair found in the string is multiplied by the
    ratio registered for its unit in UNIT_RATIOS and the products are
    summed. A leading ``-`` negates the total.
    """
    sign = 1
    if time_string.startswith('-'):
        sign = -1
        time_string = time_string[1:]

    total = 0
    for value, unit in self.RE_TIME_STRING.findall(time_string):
        total += int(value) * self.UNIT_RATIOS[unit]
    return sign * total
|
||||
|
|
||||
def parse_encoding(self, encoding_name):
    """Validate an encoding name given on the command line.

    Returns the name unchanged when ``codecs.lookup`` knows it; otherwise
    raises argparse.ArgumentTypeError so argparse reports a usage error.
    """
    try:
        codecs.lookup(encoding_name)
    except LookupError as error:
        # Exceptions have no ``.message`` attribute on Python 3; str()
        # works on both major versions.
        raise argparse.ArgumentTypeError(str(error))
    return encoding_name
|
||||
|
|
||||
def shift(self):
    """Shift the input file by the parsed time offset and write the result."""
    subtitles = self.input_file
    subtitles.shift(milliseconds=self.arguments.time_offset)
    subtitles.write_into(self.output_file)
|
||||
|
|
||||
def rate(self):
    """Rescale the input file by final/initial frame rate and write the result."""
    speed_factor = self.arguments.final / self.arguments.initial
    subtitles = self.input_file
    subtitles.shift(ratio=speed_factor)
    subtitles.write_into(self.output_file)
|
||||
|
|
||||
def split(self):
    """Write one numbered file per part delimited by the given limits.

    The boundaries are 0, the parsed limits, and one past the end of the
    last item. Part N is saved as ``<base>.<N><extension>`` after being
    shifted back by its start boundary and re-indexed.
    """
    last_boundary = self.input_file[-1].end.ordinal + 1
    boundaries = [0] + self.arguments.limits + [last_boundary]
    base_name, extension = os.path.splitext(self.arguments.file)

    windows = zip(boundaries[:-1], boundaries[1:])
    for number, (start, end) in enumerate(windows, 1):
        file_name = '%s.%s%s' % (base_name, number, extension)
        part_file = self.input_file.slice(ends_after=start, starts_before=end)
        part_file.shift(milliseconds=-start)
        part_file.clean_indexes()
        part_file.save(path=file_name, encoding=self.output_encoding)
|
||||
|
|
||||
def create_backup(self):
    """Prepare an in-place edit.

    Copies the file to ``<file>.bak`` unless that backup already exists,
    then reads from the backup and writes to the original path.
    """
    original_path = self.arguments.file
    backup_path = original_path + self.BACKUP_EXTENSION
    if not os.path.exists(backup_path):
        shutil.copy2(original_path, backup_path)
    self.output_file_path = original_path
    self.arguments.file = backup_path
|
||||
|
|
||||
def break_lines(self):
    """Re-wrap every item's text to at most ``length`` characters per line."""
    pattern = r'(.{,%i})(?:\s+|$)' % self.arguments.length
    split_re = re.compile(pattern)
    subtitles = self.input_file
    for item in subtitles:
        pieces = split_re.split(item.text)
        # The odd positions of the split result hold the captured chunks.
        item.text = '\n'.join(pieces[1::2])
    subtitles.write_into(self.output_file)
|
||||
|
|
||||
@property
def output_encoding(self):
    """The encoding requested with -e, falling back to the input file's own."""
    requested = self.arguments.output_encoding
    if requested:
        return requested
    return self.input_file.encoding
|
||||
|
|
||||
@property
def input_file(self):
    """The parsed input SubRipFile, opened on first access and then cached.

    The encoding is guessed with ``detect`` from the raw file content and
    parse errors are logged (SubRipFile.ERROR_LOG).
    """
    if hasattr(self, '_source_file'):
        return self._source_file

    with open(self.arguments.file, 'rb') as handle:
        raw = handle.read()
    detected = detect(raw).get('encoding')
    encoding = self.normalize_encoding(detected)

    self._source_file = SubRipFile.open(
        self.arguments.file, encoding=encoding,
        error_handling=SubRipFile.ERROR_LOG)
    return self._source_file
|
||||
|
|
||||
@property
def output_file(self):
    """The stream results are written to, created on first access.

    This is the file at ``output_file_path`` when one is set (in-place
    editing), otherwise standard output.
    """
    if hasattr(self, '_output_file'):
        return self._output_file

    if self.output_file_path:
        target = codecs.open(self.output_file_path, 'w+', encoding=self.output_encoding)
    else:
        target = sys.stdout
    self._output_file = target
    return target
|
||||
|
|
||||
def normalize_encoding(self, encoding):
    """Lower-case an encoding name and replace dashes with underscores.

    ``None`` (no encoding could be detected) is passed through unchanged
    instead of raising AttributeError.
    NOTE(review): assumes SubRipFile.open accepts ``encoding=None`` --
    confirm against its signature.
    """
    if encoding is None:
        return None
    return encoding.lower().replace('-', '_')
|
||||
|
|
||||
|
|
||||
def main():
    """Command-line entry point: run a SubRipShifter on the process arguments."""
    shifter = SubRipShifter()
    shifter.run(sys.argv[1:])
|
||||
|
|
||||
if __name__ == '__main__': |
|
||||
main() |
|
@ -1,26 +0,0 @@ |
|||||
class ComparableMixin(object):
    """Provide the six rich comparisons on top of a ``_cmpkey()`` method.

    Both operands are compared through the value returned by ``_cmpkey()``.
    When an operand has no ``_cmpkey`` or the keys cannot be compared, the
    operators return NotImplemented.
    """

    def _compare(self, other, method):
        try:
            outcome = method(self._cmpkey(), other._cmpkey())
        except (AttributeError, TypeError):
            # _cmpkey not implemented, or returns a different type,
            # so there is no way to compare with "other".
            outcome = NotImplemented
        return outcome

    def __lt__(self, other):
        return self._compare(other, lambda mine, theirs: mine < theirs)

    def __le__(self, other):
        return self._compare(other, lambda mine, theirs: mine <= theirs)

    def __eq__(self, other):
        return self._compare(other, lambda mine, theirs: mine == theirs)

    def __ge__(self, other):
        return self._compare(other, lambda mine, theirs: mine >= theirs)

    def __gt__(self, other):
        return self._compare(other, lambda mine, theirs: mine > theirs)

    def __ne__(self, other):
        return self._compare(other, lambda mine, theirs: mine != theirs)
|
@ -1,24 +0,0 @@ |
|||||
|
|
||||
import sys |
|
||||
|
|
||||
# Syntax sugar. |
|
||||
# Syntax sugar: short alias for the interpreter version tuple.
_ver = sys.version_info

#: Python 2.x?
is_py2 = (_ver[0] == 2)

#: Python 3.x?
is_py3 = (_ver[0] == 3)

from io import open as io_open

# Expose the same four names (builtin_str, basestring, str, open) on both
# major versions so the rest of the package can import them from here.
if is_py3:
    builtin_str = str
    basestring = (str, bytes)
    str = str
    open = open
elif is_py2:
    builtin_str = str
    basestring = basestring
    str = unicode
    open = io_open
|
@ -1,31 +0,0 @@ |
|||||
""" |
|
||||
Exception classes |
|
||||
""" |
|
||||
|
|
||||
|
|
||||
class Error(Exception):
    """Base class of every exception defined by pysrt."""
|
||||
|
|
||||
|
|
||||
class InvalidTimeString(Error):
    """Raised when the parser fails on a badly formatted time string."""
|
||||
|
|
||||
|
|
||||
class InvalidItem(Error):
    """Raised when the parser fails to parse a subtitle item."""
|
||||
|
|
||||
|
|
||||
class InvalidIndex(InvalidItem):
    """Raised when the parser fails to parse a subtitle index."""
|
@ -1,312 +0,0 @@ |
|||||
# -*- coding: utf-8 -*- |
|
||||
import os |
|
||||
import sys |
|
||||
import codecs |
|
||||
|
|
||||
try: |
|
||||
from collections import UserList |
|
||||
except ImportError: |
|
||||
from UserList import UserList |
|
||||
|
|
||||
from itertools import chain |
|
||||
from copy import copy |
|
||||
|
|
||||
from pysrt.srtexc import Error |
|
||||
from pysrt.srtitem import SubRipItem |
|
||||
from pysrt.compat import str |
|
||||
|
|
||||
# Byte-order marks paired with the codec each one announces. Order matters
# for a first-match prefix scan: the UTF-32 marks are listed before the
# UTF-16 ones because BOM_UTF32_LE starts with the bytes of BOM_UTF16_LE.
BOMS = (
    (codecs.BOM_UTF32_LE, 'utf_32_le'),
    (codecs.BOM_UTF32_BE, 'utf_32_be'),
    (codecs.BOM_UTF16_LE, 'utf_16_le'),
    (codecs.BOM_UTF16_BE, 'utf_16_be'),
    (codecs.BOM_UTF8, 'utf_8'),
)

# Codec name -> its byte-order mark decoded with that same codec.
CODECS_BOMS = dict((name, str(mark, name)) for mark, name in BOMS)

# Length in bytes of the longest known mark.
BIGGER_BOM = max(len(mark) for mark, _name in BOMS)
|
||||
|
|
||||
|
|
||||
class SubRipFile(UserList, object):
    """
    SubRip file descriptor.

    Provide a pure Python mapping on all metadata.

    SubRipFile(items, eol, path, encoding)

    items -> list of SubRipItem. Default to [].
    eol -> str: end of line character. Default to linesep used in opened file
        if any else to os.linesep.
    path -> str: path where file will be saved. To open an existing file see
        SubRipFile.open.
    encoding -> str: encoding used at file save. Default to utf-8.
    """
    # Accepted values for the `error_handling` arguments.
    ERROR_PASS = 0   # ignore items that fail to parse
    ERROR_LOG = 1    # report them on sys.stderr and carry on
    ERROR_RAISE = 2  # re-raise the parsing error

    # Encoding assumed when a file carries no byte-order mark.
    DEFAULT_ENCODING = 'utf_8'

    def __init__(self, items=None, eol=None, path=None, encoding='utf-8'):
        UserList.__init__(self, items or [])
        self._eol = eol
        self.path = path
        self.encoding = encoding

    def _get_eol(self):
        """Return the stored end of line, or os.linesep when none is set."""
        return self._eol or os.linesep

    def _set_eol(self, eol):
        """Store `eol` only when no end of line has been set yet."""
        self._eol = self._eol or eol

    eol = property(_get_eol, _set_eol)

    def slice(self, starts_before=None, starts_after=None, ends_before=None,
              ends_after=None):
        """
        slice([starts_before][, starts_after][, ends_before][, ends_after]) \
-> SubRipFile clone

        All arguments are optional, and should be coercible to SubRipTime
        object.

        It reduces the set of subtitles to those that match the given time
        constraints. A constraint that evaluates as false (None, an empty
        dict, ...) is not applied.

        The returned set is a clone, but still contains references to original
        subtitles. So if you shift this returned set, subs contained in the
        original SubRipFile instance will be altered too.

        Example:
            >>> subs.slice(ends_after={'seconds': 20}).shift(seconds=2)
        """
        clone = copy(self)

        if starts_before:
            clone.data = (i for i in clone.data if i.start < starts_before)
        if starts_after:
            clone.data = (i for i in clone.data if i.start > starts_after)
        if ends_before:
            clone.data = (i for i in clone.data if i.end < ends_before)
        if ends_after:
            clone.data = (i for i in clone.data if i.end > ends_after)

        # Materialise the (possibly chained) generators; this also gives the
        # clone its own list even when no constraint was applied.
        clone.data = list(clone.data)
        return clone

    def at(self, timestamp=None, **kwargs):
        """
        at(timestamp) -> SubRipFile clone

        timestamp argument should be coercible to SubRipTime object.

        A specialization of slice. Return all subtitles visible at the
        timestamp mark.

        Example:
            >>> subs.at((0, 0, 20, 0)).shift(seconds=2)
            >>> subs.at(seconds=20).shift(seconds=2)
        """
        time = timestamp or kwargs
        return self.slice(starts_before=time, ends_after=time)

    def shift(self, *args, **kwargs):
        """shift(hours, minutes, seconds, milliseconds, ratio)

        Shift `start` and `end` attributes of each item of the file either by
        applying a ratio or by adding an offset. The arguments are forwarded
        unchanged to every item's own `shift` method.

        `ratio` should be either an int or a float.
        Example to convert subtitles from 23.9 fps to 25 fps:
        >>> subs.shift(ratio=25/23.9)

        All "time" arguments are optional and have a default value of 0.
        Example to delay all subs by two seconds and a half:
        >>> subs.shift(seconds=2, milliseconds=500)
        """
        for item in self:
            item.shift(*args, **kwargs)

    def clean_indexes(self):
        """
        clean_indexes()

        Sort subs and reset their index attribute (numbering starts at 1).
        Should be called after destructive operations like split or such.
        """
        self.sort()
        for index, item in enumerate(self):
            item.index = index + 1

    @property
    def text(self):
        """Text of every item, joined with newline characters."""
        return '\n'.join(i.text for i in self)

    @classmethod
    def open(cls, path='', encoding=None, error_handling=ERROR_PASS):
        """
        open([path, [encoding]])

        If you do not provide any encoding, it can be detected if the file
        contains a byte order mark, unless it is set to utf-8 as default.

        The encoding actually used to read the file is stored on the returned
        instance, so that a later `save()` without arguments has a usable
        encoding.
        """
        # Resolve the encoding up front: keeping None on the instance would
        # make save() open the output file without any codec.
        encoding = encoding or cls._detect_encoding(path)
        new_file = cls(path=path, encoding=encoding)
        source_file = cls._open_unicode_file(path, claimed_encoding=encoding)
        try:
            new_file.read(source_file, error_handling=error_handling)
        finally:
            # Close the file even when parsing raises (ERROR_RAISE).
            source_file.close()
        return new_file

    @classmethod
    def from_string(cls, source, **kwargs):
        """
        from_string(source, **kwargs) -> SubRipFile

        `source` -> a unicode instance or at least a str instance encoded with
        `sys.getdefaultencoding()`

        An optional `error_handling` keyword is passed to `read()`; every
        other keyword is passed to the constructor.
        """
        error_handling = kwargs.pop('error_handling', cls.ERROR_PASS)
        new_file = cls(**kwargs)
        new_file.read(source.splitlines(True), error_handling=error_handling)
        return new_file

    def read(self, source_file, error_handling=ERROR_PASS):
        """
        read(source_file, [error_handling])

        This method parses subtitles contained in `source_file` and appends
        them to the current instance. Returns the instance itself.

        `source_file` -> Any iterable that yields unicode strings, like a file
            opened with `codecs.open()` or an array of unicode.
        """
        # The eol setter keeps any end of line that is already set.
        self.eol = self._guess_eol(source_file)
        self.extend(self.stream(source_file, error_handling=error_handling))
        return self

    @classmethod
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        This method yields SubRipItem instances as soon as they have been
        parsed without storing them. It is a kind of SAX parser for .srt
        files.

        `source_file` -> Any iterable that yields unicode strings, like a file
            opened with `codecs.open()` or an array of unicode.

        Example:
            >>> import pysrt
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in pysrt.stream(file):
            ...     sub.text += "\nHello !"
            ...     print(sub)
        """
        string_buffer = []
        # The extra '\n' acts as a final blank line so that the last item is
        # flushed even when the source does not end with one.
        for index, line in enumerate(chain(source_file, '\n')):
            if line.strip():
                string_buffer.append(line)
            else:
                source = string_buffer
                string_buffer = []
                if source and all(source):
                    try:
                        yield SubRipItem.from_lines(source)
                    except Error as error:
                        # Attach the offending text for the error handler.
                        error.args += (''.join(source), )
                        cls._handle_error(error, error_handling, index)

    def save(self, path=None, encoding=None, eol=None):
        """
        save([path][, encoding][, eol])

        Use initial path if no other provided.
        Use initial encoding if no other provided (DEFAULT_ENCODING when the
        instance has none either).
        Use initial eol if no other provided.
        """
        path = path or self.path
        encoding = encoding or self.encoding or self.DEFAULT_ENCODING

        save_file = codecs.open(path, 'w+', encoding=encoding)
        try:
            self.write_into(save_file, eol=eol)
        finally:
            # Do not leak the handle when serialisation fails.
            save_file.close()

    def write_into(self, output_file, eol=None):
        """
        write_into(output_file [, eol])

        Serialize current state into `output_file`.

        `output_file` -> Any instance that responds to `write()`, typically a
            file object
        """
        output_eol = eol or self.eol

        for item in self:
            string_repr = str(item)
            if output_eol != '\n':
                string_repr = string_repr.replace('\n', output_eol)
            output_file.write(string_repr)
            # Only add trailing eol if it's not already present.
            # It was kept in the SubRipItem's text before but it really
            # belongs here. Existing applications might give us subtitles
            # which already contain a trailing eol though.
            if not string_repr.endswith(2 * output_eol):
                output_file.write(output_eol)

    @classmethod
    def _guess_eol(cls, string_iterable):
        """Return the line ending of the first line, or os.linesep if none."""
        first_line = cls._get_first_line(string_iterable)
        # '\r\n' is tested first: a line ending with it also ends with '\n'.
        for eol in ('\r\n', '\r', '\n'):
            if first_line.endswith(eol):
                return eol
        return os.linesep

    @classmethod
    def _get_first_line(cls, string_iterable):
        """
        Return the first line of `string_iterable` ('' when it is empty).

        When the object offers both `tell` and `seek`, its position is
        restored afterwards so the line can be read again by the parser.
        """
        # Both methods are required: a position can only be restored if it
        # could be recorded first.
        can_rewind = (hasattr(string_iterable, 'tell')
                      and hasattr(string_iterable, 'seek'))
        if can_rewind:
            previous_position = string_iterable.tell()

        try:
            first_line = next(iter(string_iterable))
        except StopIteration:
            return ''
        if can_rewind:
            string_iterable.seek(previous_position)

        return first_line

    @classmethod
    def _detect_encoding(cls, path):
        """
        Return the codec announced by the file's byte-order mark, or
        DEFAULT_ENCODING when the file does not start with a known mark.
        """
        with open(path, 'rb') as file_descriptor:
            first_chars = file_descriptor.read(BIGGER_BOM)

        for bom, encoding in BOMS:
            if first_chars.startswith(bom):
                return encoding

        # TODO: maybe a chardet integration
        return cls.DEFAULT_ENCODING

    @classmethod
    def _open_unicode_file(cls, path, claimed_encoding=None):
        """
        Open `path` for decoded reading and return the file object, positioned
        after the byte-order mark when the file starts with one.

        `claimed_encoding` -> encoding to use; detected from the file when not
            given.
        """
        encoding = claimed_encoding or cls._detect_encoding(path)
        # Plain 'r': codecs.open always opens the underlying file in binary
        # mode, and the 'U' flag is rejected by recent Python versions.
        source_file = codecs.open(path, 'r', encoding=encoding)

        try:
            # get rid of BOM if any
            possible_bom = CODECS_BOMS.get(encoding, None)
            if possible_bom:
                file_bom = source_file.read(len(possible_bom))
                if not file_bom == possible_bom:
                    source_file.seek(0)  # if not rewind
        except Exception:
            # The caller never receives the handle in that case: close it here.
            source_file.close()
            raise
        return source_file

    @classmethod
    def _handle_error(cls, error, error_handling, index):
        """
        Apply the `error_handling` policy to a parsing `error`.

        ERROR_RAISE -> re-raise `error` with `index` prepended to its args.
        ERROR_LOG -> write the error name, `index` and the first error
            argument to sys.stderr.
        Any other value -> do nothing.
        """
        if error_handling == cls.ERROR_RAISE:
            error.args = (index, ) + error.args
            raise error
        if error_handling == cls.ERROR_LOG:
            name = type(error).__name__
            sys.stderr.write('PySRT-%s(line %s): \n' % (name, index))
            message = str(error.args[0]) if error.args else ''
            # Replace non-ASCII characters, then decode again: sys.stderr is a
            # text stream and refuses bytes on Python 3.
            message = message.encode('ascii', 'replace').decode('ascii')
            sys.stderr.write(message)
            sys.stderr.write('\n')
|
@ -1,76 +0,0 @@ |
|||||
# -*- coding: utf-8 -*- |
|
||||
""" |
|
||||
SubRip's subtitle parser |
|
||||
""" |
|
||||
from pysrt.srtexc import InvalidItem, InvalidIndex |
|
||||
from pysrt.srttime import SubRipTime |
|
||||
from pysrt.comparablemixin import ComparableMixin |
|
||||
from pysrt.compat import str |
|
||||
|
|
||||
class SubRipItem(ComparableMixin):
    """
    SubRipItem(index, start, end, text, position)

    index -> int: index of item in file. 0 by default.
    start, end -> SubRipTime or coercible.
    text -> unicode: text content for item.
    position -> unicode: raw srt/vtt "display coordinates" string
    """
    # Layout used by __str__: index, start, end, optional position, text.
    ITEM_PATTERN = '%s\n%s --> %s%s\n%s\n'
    # Token separating the start and end timestamps on the timing line.
    TIMESTAMP_SEPARATOR = '-->'

    def __init__(self, index=0, start=None, end=None, text='', position=''):
        # A numeric index is preferred, but any value is accepted as-is when
        # it cannot be converted.
        try:
            numeric_index = int(index)
        except (TypeError, ValueError):
            self.index = index
        else:
            self.index = numeric_index

        self.start = SubRipTime.coerce(start or 0)
        self.end = SubRipTime.coerce(end or 0)
        self.position = str(position)
        self.text = str(text)

    def __str__(self):
        """Render the item following ITEM_PATTERN."""
        if self.position.strip():
            position = ' %s' % self.position
        else:
            # A blank position adds nothing to the timing line.
            position = ''
        fields = (self.index, self.start, self.end, position, self.text)
        return self.ITEM_PATTERN % fields

    def _cmpkey(self):
        """Comparison key: the (start, end) pair."""
        return (self.start, self.end)

    def shift(self, *args, **kwargs):
        """
        shift(hours, minutes, seconds, milliseconds, ratio)

        Forward the arguments to the `shift` method of both the start and the
        end time.
        All arguments are optional and have a default value of 0.
        """
        for moment in (self.start, self.end):
            moment.shift(*args, **kwargs)

    @classmethod
    def from_string(cls, source):
        """Build an item from a single string holding all of its lines."""
        lines = source.splitlines(True)
        return cls.from_lines(lines)

    @classmethod
    def from_lines(cls, lines):
        """
        Build an item from a sequence of lines.

        The first line is taken as the index unless it already contains the
        timestamp separator. Raises InvalidItem when fewer than two lines are
        given.
        """
        if len(lines) < 2:
            raise InvalidItem()

        stripped = [raw.rstrip() for raw in lines]
        if cls.TIMESTAMP_SEPARATOR in stripped[0]:
            # No index line: the timing line comes first.
            index, remaining = None, stripped
        else:
            index, remaining = stripped[0], stripped[1:]

        start, end, position = cls.split_timestamps(remaining[0])
        return cls(index, start, end, '\n'.join(remaining[1:]), position)

    @classmethod
    def split_timestamps(cls, line):
        """
        Split a timing line into stripped start, end and position strings.

        Returns a generator yielding the three values. Raises InvalidItem
        unless the separator splits the line in exactly two parts.
        """
        pieces = line.split(cls.TIMESTAMP_SEPARATOR)
        if len(pieces) != 2:
            raise InvalidItem()

        start, tail = pieces
        # Whatever follows the first space after the end time is the position.
        end, _space, position = tail.lstrip().partition(' ')
        return (value.strip() for value in (start, end, position))
|
@ -1,176 +0,0 @@ |
|||||
# -*- coding: utf-8 -*- |
|
||||
""" |
|
||||
SubRip's time format parser: HH:MM:SS,mmm |
|
||||
""" |
|
||||
import re |
|
||||
from datetime import time |
|
||||
|
|
||||
from pysrt.srtexc import InvalidTimeString |
|
||||
from pysrt.comparablemixin import ComparableMixin |
|
||||
from pysrt.compat import str, basestring |
|
||||
|
|
||||
class TimeItemDescriptor(object):
    """
    Descriptor exposing one unit of an instance's `ordinal` attribute.

    ratio -> int: size of the unit, expressed in ordinal units.
    super_ratio -> int: size of the next larger unit; the ordinal is reduced
        modulo this value before the unit is extracted. 0 disables the
        reduction.
    """
    # pylint: disable-msg=R0903
    def __init__(self, ratio, super_ratio=0):
        self.super_ratio = int(super_ratio)
        self.ratio = int(ratio)

    def _get_ordinal(self, instance):
        """Return the ordinal, reduced modulo `super_ratio` when it is set."""
        total = instance.ordinal
        if not self.super_ratio:
            return total
        return total % self.super_ratio

    def __get__(self, instance, klass):
        # Only meaningful on instances: access through the class is refused.
        if instance is None:
            raise AttributeError
        windowed = self._get_ordinal(instance)
        return windowed // self.ratio

    def __set__(self, instance, value):
        unit = self.ratio
        windowed = self._get_ordinal(instance)
        remainder = instance.ordinal % unit
        # (windowed - remainder) is what this unit currently contributes to
        # the ordinal; swap it for the new contribution.
        instance.ordinal += value * unit - (windowed - remainder)
|
||||
|
|
||||
|
|
||||
class SubRipTime(ComparableMixin):
    """
    A time value stored as a single count of milliseconds (`ordinal`) and
    exposed as hours, minutes, seconds and milliseconds.
    """
    TIME_PATTERN = '%02d:%02d:%02d,%03d'
    TIME_REPR = 'SubRipTime(%d, %d, %d, %d)'
    RE_TIME_SEP = re.compile(r'\:|\.|\,')
    RE_INTEGER = re.compile(r'^(\d+)')
    # Size of each unit, in milliseconds.
    SECONDS_RATIO = 1000
    MINUTES_RATIO = SECONDS_RATIO * 60
    HOURS_RATIO = MINUTES_RATIO * 60

    # Each unit is a view on `ordinal`; hours is the only unbounded one.
    hours = TimeItemDescriptor(HOURS_RATIO)
    minutes = TimeItemDescriptor(MINUTES_RATIO, HOURS_RATIO)
    seconds = TimeItemDescriptor(SECONDS_RATIO, MINUTES_RATIO)
    milliseconds = TimeItemDescriptor(1, SECONDS_RATIO)

    def __init__(self, hours=0, minutes=0, seconds=0, milliseconds=0):
        """
        SubRipTime(hours, minutes, seconds, milliseconds)

        All arguments are optional and have a default value of 0.
        """
        super(SubRipTime, self).__init__()
        self.ordinal = (hours * self.HOURS_RATIO
                        + minutes * self.MINUTES_RATIO
                        + seconds * self.SECONDS_RATIO
                        + milliseconds)

    def __repr__(self):
        parts = tuple(self)
        return self.TIME_REPR % parts

    def __str__(self):
        if self.ordinal < 0:
            # Represent negative times as zero
            zero = SubRipTime.from_ordinal(0)
            return str(zero)
        parts = tuple(self)
        return self.TIME_PATTERN % parts

    def _compare(self, other, method):
        # Coerce first, so that anything `coerce` accepts can be compared.
        coerced = self.coerce(other)
        return super(SubRipTime, self)._compare(coerced, method)

    def _cmpkey(self):
        """Comparison key: the total count of milliseconds."""
        return self.ordinal

    def __add__(self, other):
        delta = self.coerce(other).ordinal
        return self.from_ordinal(self.ordinal + delta)

    def __iadd__(self, other):
        delta = self.coerce(other).ordinal
        self.ordinal += delta
        return self

    def __sub__(self, other):
        delta = self.coerce(other).ordinal
        return self.from_ordinal(self.ordinal - delta)

    def __isub__(self, other):
        delta = self.coerce(other).ordinal
        self.ordinal -= delta
        return self

    def __mul__(self, ratio):
        scaled = int(round(self.ordinal * ratio))
        return self.from_ordinal(scaled)

    def __imul__(self, ratio):
        scaled = int(round(self.ordinal * ratio))
        self.ordinal = scaled
        return self

    @classmethod
    def coerce(cls, other):
        """
        Coerce many types to SubRipTime instance.
        Supported types:
          - str/unicode
          - int/long
          - datetime.time
          - any iterable
          - dict
        """
        if isinstance(other, SubRipTime):
            return other

        # Checked in this order; the first matching type decides.
        converters = (
            (basestring, cls.from_string),
            (int, cls.from_ordinal),
            (time, cls.from_time),
        )
        for kind, convert in converters:
            if isinstance(other, kind):
                return convert(other)

        # Anything else: keyword arguments first, positional as a fallback.
        try:
            return cls(**other)
        except TypeError:
            return cls(*other)

    def __iter__(self):
        # Values are read lazily, one attribute per step.
        for unit in ('hours', 'minutes', 'seconds', 'milliseconds'):
            yield getattr(self, unit)

    def shift(self, *args, **kwargs):
        """
        shift(hours, minutes, seconds, milliseconds)

        All arguments are optional and have a default value of 0.
        A `ratio` keyword, when given, is applied as a multiplication before
        the offset is added.
        """
        if 'ratio' in kwargs:
            factor = kwargs.pop('ratio')
            self *= factor
        offset = self.__class__(*args, **kwargs)
        self += offset

    @classmethod
    def from_ordinal(cls, ordinal):
        """
        int -> SubRipTime corresponding to a total count of milliseconds
        """
        total = int(ordinal)
        return cls(milliseconds=total)

    @classmethod
    def from_string(cls, source):
        """
        str/unicode(HH:MM:SS,mmm) -> SubRipTime corresponding to serial
        raise InvalidTimeString
        """
        fields = cls.RE_TIME_SEP.split(source)
        if len(fields) != 4:
            raise InvalidTimeString
        return cls(*map(cls.parse_int, fields))

    @classmethod
    def parse_int(cls, digits):
        """
        Convert `digits` to an int; fall back to its leading run of digits,
        then to 0 when it has none.
        """
        try:
            return int(digits)
        except ValueError:
            leading = cls.RE_INTEGER.match(digits)
            return int(leading.group()) if leading else 0

    @classmethod
    def from_time(cls, source):
        """
        datetime.time -> SubRipTime corresponding to time object
        """
        # Microseconds are truncated to whole milliseconds.
        millis = source.microsecond // 1000
        return cls(hours=source.hour, minutes=source.minute,
                   seconds=source.second, milliseconds=millis)

    def to_time(self):
        """
        Convert SubRipTime instance into a pure datetime.time object
        """
        microseconds = self.milliseconds * 1000
        return time(self.hours, self.minutes, self.seconds, microseconds)
|
@ -1,2 +0,0 @@ |
|||||
# Package version as a (major, minor, patch) tuple.
VERSION = (1, 0, 1)

# The same version in dotted form.
VERSION_STRING = '.'.join(map(str, VERSION))
|
Loading…
Reference in new issue