Stop scanning the par2 file once we have the information of all files

4 years ago · 22f1d2f642
1 changed files with 41 additions and 23 deletions
--- a/sabnzbd/par2file.py
+++ b/sabnzbd/par2file.py
@ -23,12 +23,15 @@ import logging
 import os
 import re
 import struct
-from typing import Dict, Optional, Tuple
+from typing import Dict, Optional, Tuple, BinaryIO

+from sabnzbd.constants import MEBI
 from sabnzbd.encoding import correct_unknown_encoding

 PROBABLY_PAR2_RE = re.compile(r"(.*)\.vol(\d*)[+\-](\d*)\.par2", re.I)
+SCAN_LIMIT = 10 * MEBI
 PAR_PKT_ID = b"PAR2\x00PKT"
+PAR_MAIN_ID = b"PAR 2.0\x00Main\x00\x00\x00\x00"
 PAR_FILE_ID = b"PAR 2.0\x00FileDesc"
 PAR_CREATOR_ID = b"PAR 2.0\x00Creator\x00"
 PAR_RECOVERY_ID = b"RecvSlic"
@ -91,17 +94,17 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]:
    For a full description of the par2 specification, visit:
    http://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html
    """
+    total_size = os.path.getsize(fname)
    table = {}
    duplicates16k = []
+    total_nr_files = None

    try:
        with open(fname, "rb") as f:
            header = f.read(8)
            while header:
-                name, filehash, hash16k = parse_par2_file_packet(f, header)
+                name, filehash, hash16k, nr_files = parse_par2_packet(f, header)
                if name:
-                    if name in table:
-                        break
                    table[name] = filehash
                    if hash16k not in md5of16k:
                        md5of16k[hash16k] = name
@ -110,6 +113,15 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]:
                        # Remove to avoid false-renames
                        duplicates16k.append(hash16k)

+                # Store the number of files for later
+                if nr_files:
+                    total_nr_files = nr_files
+
+                # On large files (over SCAN_LIMIT), we stop once all files from the Main packet are seen
+                # On smaller files, we scan them fully to also get the par2-creator
+                if total_size > SCAN_LIMIT and len(table) == total_nr_files:
+                    break
+
                header = f.read(8)

    except (struct.error, IndexError):
@ -131,10 +143,12 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]:
    return table


-def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], Optional[bytes]]:
-    """Look up and analyze a FileDesc package"""
+def parse_par2_packet(
+    f: BinaryIO, header: bytes
+) -> Tuple[Optional[str], Optional[bytes], Optional[bytes], Optional[int]]:
+    """Look up and analyze a PAR2 packet"""

-    nothing = None, None, None
+    filename, filehash, hash16k, nr_files = nothing = None, None, None, None

    # All packages start with a header before the body
    # 8	  : PAR2\x00PKT
@ -163,27 +177,31 @@ def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], O
    if md5sum != md5.digest():
        return nothing

-    # The FileDesc packet looks like:
-    # 16 : "PAR 2.0\0FileDesc"
-    # 16 : FileId
-    # 16 : Hash for full file **
-    # 16 : Hash for first 16K
-    #  8 : File length
-    # xx : Name (multiple of 4, padded with \0 if needed) **
-
-    # See if it's the right packet and get name + hash
+    # See if it's any of the packets we care about
    offset = 16
-    par2id = data[offset : offset + 16]
-
-    if par2id == PAR_FILE_ID:
+    par2_packet_type = data[offset : offset + 16]
+
+    if par2_packet_type == PAR_FILE_ID:
+        # The FileDesc packet looks like:
+        # 16 : "PAR 2.0\0FileDesc"
+        # 16 : FileId
+        # 16 : Hash for full file
+        # 16 : Hash for first 16K
+        #  8 : File length
+        # xx : Name (multiple of 4, padded with \0 if needed)
        filehash = data[offset + 32 : offset + 48]
        hash16k = data[offset + 48 : offset + 64]
        filename = correct_unknown_encoding(data[offset + 72 :].strip(b"\0"))
-        return filename, filehash, hash16k
-    elif par2id == PAR_CREATOR_ID:
+    elif par2_packet_type == PAR_CREATOR_ID:
        # From here until the end is the creator-text
        # Useful in case of bugs in the par2-creating software
        par2creator = data[offset + 16 :].strip(b"\0")  # Remove any trailing \0
        logging.debug("Par2-creator of %s is: %s", os.path.basename(f.name), correct_unknown_encoding(par2creator))
-
-    return nothing
+    elif par2_packet_type == PAR_MAIN_ID:
+        # The Main packet looks like:
+        # 16 : "PAR 2.0\0Main\0\0\0\0"
+        # 8  : Slice size
+        # 4  : Number of files in the recovery set
+        nr_files = struct.unpack("<I", data[offset + 24 : offset + 28])[0]
+
+    return filename, filehash, hash16k, nr_files