* Change core system to improve performance and facilitate multi TV info sources
* Change migrate core objects TVShow and TVEpisode and everywhere these objects are used
* Add message to logs and disable UI backlog buttons when no media provider has active and/or scheduled searching enabled
* Change views for py3 compat
* Change set default runtime of 5 mins if none is given for layout Day by Day
* Add OpenSubtitles authentication support to config/Subtitles/Subtitles Plugin
* Add "Enforce media hash match" to config/Subtitles Plugin/Opensubtitles for accurate subs if enabled; if disabled, search failures will fall back to less reliable subtitle results
* Add Apprise 0.8.0 (6aa52c3)
* Add hachoir_py3 3.0a6 (5b9e05a)
* Add sgmllib3k 1.0.0
* Update soupsieve 1.9.1 (24859cc) to soupsieve_py2 1.9.5 (6a38398)
* Add soupsieve_py3 2.0.0.dev (69194a2)
* Add Tornado_py3 Web Server 6.0.3 (ff985fe)
* Add xmlrpclib_to 0.1.1 (c37db9e)
* Remove ancient Growl lib 0.1
* Remove xmltodict library
* Change requirements.txt for Cheetah3 to minimum 3.2.4
* Change update sabToSickBeard
* Change update autoProcessTV
* Change remove Twitter notifier
* Update NZBGet Process Media extension, SickGear-NG 1.7 → 2.4
* Update Kodi addon 1.0.3 → 1.0.4
* Update ADBA for py3
* Update Beautiful Soup 4.8.0 (r526) to 4.8.1 (r531)
* Update Send2Trash 1.3.0 (a568370) to 1.5.0 (66afce7)
* Update soupsieve 1.9.1 (24859cc) to 1.9.5 (6a38398)
* Change use GNTP (Growl Notification Transport Protocol) from Apprise
* Change add multi host support to Growl notifier
* Fix Growl notifier when using empty password
* Change update links for Growl notifications
* Change deprecate config/Notifications/Growl password field as these are now stored with the host setting
* Fix prevent infinite MemoryError from a particular jpg data structure
* Change subliminal for py3
* Change enzyme for py3
* Change browser_ua for py3
* Change feedparser for py3 (sgmllib is no longer available in the py3 standard lib, so an ext lib was added)
* Fix Guessit
* Fix parse_xml for py3
* Fix name parser with multi eps for py3
* Fix tvdb_api for py3 (search show)
* Fix config/media process to only display "pattern is invalid" qtip on the "Episode naming" tab if the associated field is actually visible; if the field becomes hidden due to a setting change, hide any previously displayed qtip. Note: JavaScript getElementById (or $('tag[id="<name>"]')) is required when an id is searched in the DOM because ":" is used in a show's id name
* Change download anidb xml files to the main cache folder and use the adba lib folder as a last resort
* Change create "get anidb show groups" as a centralised helper func and consolidate dupe code
* Change move anidb related functions to the newly renamed anime.py (from blacklistandwhitelist.py)
* Change str.encode('hex') no longer exists in py3, use codecs.encode(...) instead (see the sketch after this list)
* Change fix b64decode on py3 returning bytestrings (see the sketch after this list)
* Change use binary read when downloading a log file via the browser to prevent any encoding issues
* Change add case insensitive ordering to anime black/whitelist
* Fix anime groups list not excluding whitelisted items
* Change add Windows utf8 fix ... see: ytdl-org/youtube-dl#820
* Change if no qualities are wanted, exit the manual search thread
* Fix keepalive for py3 process media
* Change add a once a month update of tvinfo show mappings to the daily updater
* Change autocorrect ids of new shows by updating from -8 to 31 days of the airdate of episode one
* Add next run time to Manage/Show Tasks/Daily show update
* Change when fetching imdb data, if the imdb id is an episode id then try to find and use the real show id
* Change delete the diskcache db in imdbpie on ValueError (due to a change in Python version)
* Change during startup, clean up any _cleaner.pyc/o to prevent issues when switching Python versions
* Add .pyc cleaner for when the Python version is switched
* Change replace deprecated gettz_db_metadata() and gettz
* Change rebrand "SickGear PostProcessing script" to "SickGear Process Media extension"
* Change improve the setup guide to use the NZBGet version to minimise displayed text based on version
* Change NZBGet versions prior to v17 are now told to upgrade as those versions are no longer supported - the code has actually exited on start up for some time, but the docs were outdated
* Change comment out code and unused option sg_base_path
* Change supported Python versions: 2.7.9-2.7.18 inclusive, expanded to include 3.7.1-3.8.1 inclusive
* Change pidfile creation under Linux to 0o644
* Make logger accept lists to output continuously using the log_lock instead of being split up by other processes
* Fix long path issues with Windows process media
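The two py3 string-handling notes above reduce to the following minimal sketch; the byte values are illustrative only:

    import base64
    import codecs

    # py2's str.encode('hex') is gone in py3; codecs.encode handles bytes on both
    hex_bytes = codecs.encode(b'data', 'hex')  # b'64617461'

    # on py3, b64decode always returns bytes, so decode explicitly for text
    text = base64.b64decode('ZGF0YQ==').decode('utf-8')  # 'data'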
from hachoir_py3.stream import InputSubStream
from hachoir_py3.core.tools import humanFilesize, humanDuration
from hachoir_py3.core.memory import limitedMemory
from hachoir_py3.subfile.data_rate import DataRate
from hachoir_py3.subfile.output import Output
from hachoir_py3.subfile.pattern import HachoirPatternMatching as PatternMatching
from sys import stderr
from time import time
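
# hachoir streams address data in bits: byte offsets and sizes from callers
# are multiplied by 8 on the way in and divided by 8 again for display.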

def skipSubfile(parser):
    subfile = parser.getParserTags().get("subfile", "")
    return subfile == "skip"


FILE_MAX_SIZE = 100 * 1024 * 1024  # Max. file size in bytes (100 MB)
SLICE_SIZE = 64 * 1024  # Slice size in bytes (64 KB)
MEMORY_LIMIT = 50 * 1024 * 1024  # Memory limit in bytes (50 MB)
PROGRESS_UPDATE = 1.5  # Minimum number of seconds between two progress messages


class SearchSubfile:
    """
    Tool to find file start and file size in any binary stream.

    To use it:
    - instantiate the class: subfile = SearchSubfile(stream)
    - (optional) choose parsers with: subfile.loadParsers(categories, parser_ids)
    - run the search: subfile.main()
    """

    def __init__(self, stream, offset=0, size=None):
        """
        Set up the search tool. Parameters:
        - stream: input stream to search
        - offset: offset (in bytes) of the beginning of the search
        - size: limit size (in bytes) of the input data (None: no limit)
        """
        # Size
        self.stream = stream
        if size is not None:
            self.size = min(self.stream.size, (offset + size) * 8)
        else:
            self.size = self.stream.size

        # Offset
        self.start_offset = offset * 8
        self.current_offset = self.start_offset
        self.slice_size = SLICE_SIZE * 8  # 64 KB (in bits)

        # Statistics
        self.datarate = DataRate(self.start_offset)
        self.main_start = time()

        # Other flags and attributes
        self.patterns = None
        self.verbose = True
        self.debug = False
        self.output = None
        self.filter = None

    def setOutput(self, directory):
        self.output = Output(directory)

    def loadParsers(self, categories=None, parser_ids=None):
        before = time()
        self.patterns = PatternMatching(categories, parser_ids)
        if self.debug:
            print("Regex compilation: %.1f ms" % ((time() - before) * 1000))
            print("Use regex: %s" % self.patterns)

    def main(self):
        """
        Run the search.
        Return True if ok, False otherwise.
        """
        # Initialize
        self.mainHeader()

        # Prepare search
        main_error = False
        try:
            # Run search
            limitedMemory(MEMORY_LIMIT, self.searchSubfiles)
        except KeyboardInterrupt:
            print("[!] Program interrupted (CTRL+C)", file=stderr)
            main_error = True
        except MemoryError:
            main_error = True
            print("[!] Memory error!", file=stderr)
        self.mainFooter()
        self.stream.close()
        return not main_error

    def mainHeader(self):
        # Load parsers if none have been chosen (must happen before the
        # slice size fix below, which dereferences self.patterns)
        if not self.patterns:
            self.loadParsers()

        # Fix slice size if needed
        self.slice_size = max(self.slice_size, self.patterns.max_length * 8)

        nbytes = (self.size - self.start_offset) // 8
        print("[+] Start search on %s bytes (%s)" % (
            nbytes, humanFilesize(nbytes)), file=stderr)
        print(file=stderr)
        self.stats = {}
        self.current_offset = self.start_offset
        self.main_start = time()

    def mainFooter(self):
        print(file=stderr)
        print("[+] End of search -- offset=%s (%s)" % (
            self.current_offset // 8, humanFilesize(self.current_offset // 8)), file=stderr)
        size = (self.current_offset - self.start_offset) // 8
        duration = time() - self.main_start
        if 0.1 <= duration:
            print("Total time: %s -- global rate: %s/sec" % (
                humanDuration(duration * 1000), humanFilesize(size // duration)), file=stderr)

    def searchSubfiles(self):
        """
        Search all subfiles in the stream, call processParser() for each parser.
        """
        self.next_offset = None
        self.next_progress = time() + PROGRESS_UPDATE
        while self.current_offset < self.size:
            self.datarate.update(self.current_offset)
            if self.verbose and self.next_progress <= time():
                self.displayProgress()
            for offset, parser in self.findMagic(self.current_offset):
                self.processParser(offset, parser)
            self.current_offset += self.slice_size
            if self.next_offset:
                self.current_offset = max(self.current_offset, self.next_offset)
            self.current_offset = min(self.current_offset, self.size)

    def processParser(self, offset, parser):
        """
        Process a valid parser.
        """
        text = "[+] File at %s" % (offset // 8)
        if parser.content_size is not None:
            text += " size=%s (%s)" % (
                parser.content_size // 8, humanFilesize(parser.content_size // 8))
        if not parser.content_size or parser.content_size // 8 < FILE_MAX_SIZE:
            text += ": " + parser.description
        else:
            text += ": " + parser.__class__.__name__
        if self.output and parser.content_size:
            if offset == 0 and parser.content_size == self.size:
                text += " (don't copy whole file)"
            elif parser.content_size // 8 >= FILE_MAX_SIZE:
                text += " (don't copy file, too big)"
            elif not self.filter or self.filter(parser):
                filename = self.output.createFilename(parser.filename_suffix)
                filename = self.output.writeFile(
                    filename, self.stream, offset, parser.content_size)
                text += " => %s" % filename
        print(text)
        self.next_progress = time() + PROGRESS_UPDATE

    def findMagic(self, offset):
        """
        Find all 'magic_str' strings in stream in offset interval:
        offset..(offset+self.slice_size).

        The function returns a generator with values (offset, parser) where
        offset is the beginning of a file (relative to stream begin), and not
        the position of the magic.
        """
        start = offset
        end = min(start + self.slice_size, self.size)
        data = self.stream.readBytes(start, (end - start) // 8)
        for parser_cls, offset in self.patterns.search(data):
            offset += start

            # Skip invalid offset
            if offset < 0:
                continue
            if self.next_offset and offset < self.next_offset:
                continue

            # Create parser at found offset
            parser = self.guess(offset, parser_cls)

            # Update statistics
            if parser_cls not in self.stats:
                self.stats[parser_cls] = [0, 0]
            self.stats[parser_cls][0] += 1
            if not parser:
                continue

            # Parser is valid, yield it with the offset
            self.stats[parser_cls][1] += 1
            if self.debug:
                print("Found %s at offset %s" % (
                    parser.__class__.__name__, offset // 8), file=stderr)
            yield (offset, parser)

            # Set next offset
            if parser.content_size is not None and skipSubfile(parser):
                self.next_offset = offset + parser.content_size
                if end <= self.next_offset:
                    break

    def guess(self, offset, parser_cls):
        """
        Try the specified parser at stream offset 'offset'.
        Return the parser object, or None on failure.
        """
        substream = InputSubStream(self.stream, offset)
        try:
            return parser_cls(substream, validate=True)
        except Exception:
            return None

    def displayProgress(self):
        """
        Display progress (to stderr) of the whole process.
        Compute data rate (in bytes per sec) and time estimation.
        """
        # Program next update
        self.next_progress = time() + PROGRESS_UPDATE

        # Progress offset
        percent = float(self.current_offset - self.start_offset) * 100 \
            / (self.size - self.start_offset)
        offset = self.current_offset // 8
        message = "Search: %.2f%% -- offset=%u (%s)" % (
            percent, offset, humanFilesize(offset))

        # Compute data rate (byte/sec)
        average = self.datarate.average
        if average:
            message += " -- %s/sec " % humanFilesize(average // 8)
            eta = float(self.size - self.current_offset) / average
            message += " -- ETA: %s" % humanDuration(eta * 1000)

        # Display message
        print(message, file=stderr)
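

# Minimal usage sketch. FileInputStream is assumed to be exposed by
# hachoir_py3.stream as in upstream hachoir; the filename, output directory
# and parser category below are illustrative only.
if __name__ == "__main__":
    from hachoir_py3.stream import FileInputStream

    stream = FileInputStream("disk.img")       # binary stream to carve
    search = SearchSubfile(stream, offset=0, size=None)
    search.setOutput("extracted")              # directory for recovered files
    search.loadParsers(categories=["image"])   # optional: restrict parser set
    search.main()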