Browse Source

Stop scanning the par2 file once we have the information for all files

pull/1885/head
Safihre 4 years ago
parent
commit
22f1d2f642
  1. 64
      sabnzbd/par2file.py

64
sabnzbd/par2file.py

@ -23,12 +23,15 @@ import logging
import os import os
import re import re
import struct import struct
from typing import Dict, Optional, Tuple from typing import Dict, Optional, Tuple, BinaryIO
from sabnzbd.constants import MEBI
from sabnzbd.encoding import correct_unknown_encoding from sabnzbd.encoding import correct_unknown_encoding
PROBABLY_PAR2_RE = re.compile(r"(.*)\.vol(\d*)[+\-](\d*)\.par2", re.I) PROBABLY_PAR2_RE = re.compile(r"(.*)\.vol(\d*)[+\-](\d*)\.par2", re.I)
SCAN_LIMIT = 10 * MEBI
PAR_PKT_ID = b"PAR2\x00PKT" PAR_PKT_ID = b"PAR2\x00PKT"
PAR_MAIN_ID = b"PAR 2.0\x00Main\x00\x00\x00\x00"
PAR_FILE_ID = b"PAR 2.0\x00FileDesc" PAR_FILE_ID = b"PAR 2.0\x00FileDesc"
PAR_CREATOR_ID = b"PAR 2.0\x00Creator\x00" PAR_CREATOR_ID = b"PAR 2.0\x00Creator\x00"
PAR_RECOVERY_ID = b"RecvSlic" PAR_RECOVERY_ID = b"RecvSlic"
@ -91,17 +94,17 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]:
For a full description of the par2 specification, visit: For a full description of the par2 specification, visit:
http://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html http://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html
""" """
total_size = os.path.getsize(fname)
table = {} table = {}
duplicates16k = [] duplicates16k = []
total_nr_files = None
try: try:
with open(fname, "rb") as f: with open(fname, "rb") as f:
header = f.read(8) header = f.read(8)
while header: while header:
name, filehash, hash16k = parse_par2_file_packet(f, header) name, filehash, hash16k, nr_files = parse_par2_packet(f, header)
if name: if name:
if name in table:
break
table[name] = filehash table[name] = filehash
if hash16k not in md5of16k: if hash16k not in md5of16k:
md5of16k[hash16k] = name md5of16k[hash16k] = name
@ -110,6 +113,15 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]:
# Remove to avoid false-renames # Remove to avoid false-renames
duplicates16k.append(hash16k) duplicates16k.append(hash16k)
# Store the number of files for later
if nr_files:
total_nr_files = nr_files
# On large files, we stop after seeing all the listings
# On smaller files, we scan them fully to get the par2-creator
if total_size > SCAN_LIMIT and len(table) == total_nr_files:
break
header = f.read(8) header = f.read(8)
except (struct.error, IndexError): except (struct.error, IndexError):
@ -131,10 +143,12 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]:
return table return table
def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], Optional[bytes]]: def parse_par2_packet(
"""Look up and analyze a FileDesc package""" f: BinaryIO, header: bytes
) -> Tuple[Optional[str], Optional[bytes], Optional[bytes], Optional[int]]:
"""Look up and analyze a PAR2 packet"""
nothing = None, None, None filename, filehash, hash16k, nr_files = nothing = None, None, None, None
# All packages start with a header before the body # All packages start with a header before the body
# 8 : PAR2\x00PKT # 8 : PAR2\x00PKT
@ -163,27 +177,31 @@ def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], O
if md5sum != md5.digest(): if md5sum != md5.digest():
return nothing return nothing
# The FileDesc packet looks like: # See if it's any of the packages we care about
# 16 : "PAR 2.0\0FileDesc"
# 16 : FileId
# 16 : Hash for full file **
# 16 : Hash for first 16K
# 8 : File length
# xx : Name (multiple of 4, padded with \0 if needed) **
# See if it's the right packet and get name + hash
offset = 16 offset = 16
par2id = data[offset : offset + 16] par2_packet_type = data[offset : offset + 16]
if par2id == PAR_FILE_ID: if par2_packet_type == PAR_FILE_ID:
# The FileDesc packet looks like:
# 16 : "PAR 2.0\0FileDesc"
# 16 : FileId
# 16 : Hash for full file
# 16 : Hash for first 16K
# 8 : File length
# xx : Name (multiple of 4, padded with \0 if needed)
filehash = data[offset + 32 : offset + 48] filehash = data[offset + 32 : offset + 48]
hash16k = data[offset + 48 : offset + 64] hash16k = data[offset + 48 : offset + 64]
filename = correct_unknown_encoding(data[offset + 72 :].strip(b"\0")) filename = correct_unknown_encoding(data[offset + 72 :].strip(b"\0"))
return filename, filehash, hash16k elif par2_packet_type == PAR_CREATOR_ID:
elif par2id == PAR_CREATOR_ID:
# From here until the end is the creator-text # From here until the end is the creator-text
# Useful in case of bugs in the par2-creating software # Useful in case of bugs in the par2-creating software
par2creator = data[offset + 16 :].strip(b"\0") # Remove any trailing \0 par2creator = data[offset + 16 :].strip(b"\0") # Remove any trailing \0
logging.debug("Par2-creator of %s is: %s", os.path.basename(f.name), correct_unknown_encoding(par2creator)) logging.debug("Par2-creator of %s is: %s", os.path.basename(f.name), correct_unknown_encoding(par2creator))
elif par2_packet_type == PAR_MAIN_ID:
return nothing # The Main packet looks like:
# 16 : "PAR 2.0\0Main"
# 8 : Slice size
# 4 : Number of files in the recovery set
nr_files = struct.unpack("<I", data[offset + 24 : offset + 28])[0]
return filename, filehash, hash16k, nr_files

Loading…
Cancel
Save