From 22f1d2f642eaddedc74e578549083d0816f2e0d4 Mon Sep 17 00:00:00 2001 From: Safihre Date: Sun, 2 May 2021 10:16:10 +0200 Subject: [PATCH] Stop scanning the par2 file once we have the information of all files --- sabnzbd/par2file.py | 64 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/sabnzbd/par2file.py b/sabnzbd/par2file.py index 9aa08ba..9463b3d 100644 --- a/sabnzbd/par2file.py +++ b/sabnzbd/par2file.py @@ -23,12 +23,15 @@ import logging import os import re import struct -from typing import Dict, Optional, Tuple +from typing import Dict, Optional, Tuple, BinaryIO +from sabnzbd.constants import MEBI from sabnzbd.encoding import correct_unknown_encoding PROBABLY_PAR2_RE = re.compile(r"(.*)\.vol(\d*)[+\-](\d*)\.par2", re.I) +SCAN_LIMIT = 10 * MEBI PAR_PKT_ID = b"PAR2\x00PKT" +PAR_MAIN_ID = b"PAR 2.0\x00Main\x00\x00\x00\x00" PAR_FILE_ID = b"PAR 2.0\x00FileDesc" PAR_CREATOR_ID = b"PAR 2.0\x00Creator\x00" PAR_RECOVERY_ID = b"RecvSlic" @@ -91,17 +94,17 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]: For a full description of the par2 specification, visit: http://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html """ + total_size = os.path.getsize(fname) table = {} duplicates16k = [] + total_nr_files = None try: with open(fname, "rb") as f: header = f.read(8) while header: - name, filehash, hash16k = parse_par2_file_packet(f, header) + name, filehash, hash16k, nr_files = parse_par2_packet(f, header) if name: - if name in table: - break table[name] = filehash if hash16k not in md5of16k: md5of16k[hash16k] = name @@ -110,6 +113,15 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]: # Remove to avoid false-renames duplicates16k.append(hash16k) + # Store the number of files for later + if nr_files: + total_nr_files = nr_files + + # On large files, we stop after seeing all the listings + # On smaller files, we scan them fully to get the par2-creator + if total_size > SCAN_LIMIT and len(table) == total_nr_files: + break + header = f.read(8) except (struct.error, IndexError): @@ -131,10 +143,12 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]: return table -def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], Optional[bytes]]: - """Look up and analyze a FileDesc package""" +def parse_par2_packet( + f: BinaryIO, header: bytes +) -> Tuple[Optional[str], Optional[bytes], Optional[bytes], Optional[int]]: + """Look up and analyze a PAR2 packet""" - nothing = None, None, None + filename, filehash, hash16k, nr_files = nothing = None, None, None, None # All packages start with a header before the body # 8 : PAR2\x00PKT @@ -163,27 +177,31 @@ def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], O if md5sum != md5.digest(): return nothing - # The FileDesc packet looks like: - # 16 : "PAR 2.0\0FileDesc" - # 16 : FileId - # 16 : Hash for full file ** - # 16 : Hash for first 16K - # 8 : File length - # xx : Name (multiple of 4, padded with \0 if needed) ** - - # See if it's the right packet and get name + hash + # See if it's any of the packages we care about offset = 16 - par2id = data[offset : offset + 16] - - if par2id == PAR_FILE_ID: + par2_packet_type = data[offset : offset + 16] + + if par2_packet_type == PAR_FILE_ID: + # The FileDesc packet looks like: + # 16 : "PAR 2.0\0FileDesc" + # 16 : FileId + # 16 : Hash for full file + # 16 : Hash for first 16K + # 8 : File length + # xx : Name (multiple of 4, padded with \0 if needed) filehash = data[offset + 32 : offset + 48] hash16k = data[offset + 48 : offset + 64] filename = correct_unknown_encoding(data[offset + 72 :].strip(b"\0")) - return filename, filehash, hash16k - elif par2id == PAR_CREATOR_ID: + elif par2_packet_type == PAR_CREATOR_ID: # From here until the end is the creator-text # Useful in case of bugs in the par2-creating software par2creator = data[offset + 16 :].strip(b"\0") # Remove any trailing \0 logging.debug("Par2-creator of %s is: %s", os.path.basename(f.name), correct_unknown_encoding(par2creator)) - - return nothing + elif par2_packet_type == PAR_MAIN_ID: + # The Main packet looks like: + # 16 : "PAR 2.0\0Main" + # 8 : Slice size + # 4 : Number of files in the recovery set + nr_files = struct.unpack("