|
|
@ -23,12 +23,15 @@ import logging |
|
|
|
import os |
|
|
|
import re |
|
|
|
import struct |
|
|
|
from typing import Dict, Optional, Tuple |
|
|
|
from typing import Dict, Optional, Tuple, BinaryIO |
|
|
|
|
|
|
|
from sabnzbd.constants import MEBI |
|
|
|
from sabnzbd.encoding import correct_unknown_encoding |
|
|
|
|
|
|
|
PROBABLY_PAR2_RE = re.compile(r"(.*)\.vol(\d*)[+\-](\d*)\.par2", re.I) |
|
|
|
SCAN_LIMIT = 10 * MEBI |
|
|
|
PAR_PKT_ID = b"PAR2\x00PKT" |
|
|
|
PAR_MAIN_ID = b"PAR 2.0\x00Main\x00\x00\x00\x00" |
|
|
|
PAR_FILE_ID = b"PAR 2.0\x00FileDesc" |
|
|
|
PAR_CREATOR_ID = b"PAR 2.0\x00Creator\x00" |
|
|
|
PAR_RECOVERY_ID = b"RecvSlic" |
|
|
@ -91,17 +94,17 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]: |
|
|
|
For a full description of the par2 specification, visit: |
|
|
|
http://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html |
|
|
|
""" |
|
|
|
total_size = os.path.getsize(fname) |
|
|
|
table = {} |
|
|
|
duplicates16k = [] |
|
|
|
total_nr_files = None |
|
|
|
|
|
|
|
try: |
|
|
|
with open(fname, "rb") as f: |
|
|
|
header = f.read(8) |
|
|
|
while header: |
|
|
|
name, filehash, hash16k = parse_par2_file_packet(f, header) |
|
|
|
name, filehash, hash16k, nr_files = parse_par2_packet(f, header) |
|
|
|
if name: |
|
|
|
if name in table: |
|
|
|
break |
|
|
|
table[name] = filehash |
|
|
|
if hash16k not in md5of16k: |
|
|
|
md5of16k[hash16k] = name |
|
|
@ -110,6 +113,15 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]: |
|
|
|
# Remove to avoid false-renames |
|
|
|
duplicates16k.append(hash16k) |
|
|
|
|
|
|
|
# Store the number of files for later |
|
|
|
if nr_files: |
|
|
|
total_nr_files = nr_files |
|
|
|
|
|
|
|
# On large files, we stop after seeing all the listings |
|
|
|
# On smaller files, we scan them fully to get the par2-creator |
|
|
|
if total_size > SCAN_LIMIT and len(table) == total_nr_files: |
|
|
|
break |
|
|
|
|
|
|
|
header = f.read(8) |
|
|
|
|
|
|
|
except (struct.error, IndexError): |
|
|
@ -131,10 +143,12 @@ def parse_par2_file(fname: str, md5of16k: Dict[bytes, str]) -> Dict[str, bytes]: |
|
|
|
return table |
|
|
|
|
|
|
|
|
|
|
|
def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], Optional[bytes]]: |
|
|
|
"""Look up and analyze a FileDesc package""" |
|
|
|
def parse_par2_packet( |
|
|
|
f: BinaryIO, header: bytes |
|
|
|
) -> Tuple[Optional[str], Optional[bytes], Optional[bytes], Optional[int]]: |
|
|
|
"""Look up and analyze a PAR2 packet""" |
|
|
|
|
|
|
|
nothing = None, None, None |
|
|
|
filename, filehash, hash16k, nr_files = nothing = None, None, None, None |
|
|
|
|
|
|
|
# All packages start with a header before the body |
|
|
|
# 8 : PAR2\x00PKT |
|
|
@ -163,27 +177,31 @@ def parse_par2_file_packet(f, header) -> Tuple[Optional[str], Optional[bytes], O |
|
|
|
if md5sum != md5.digest(): |
|
|
|
return nothing |
|
|
|
|
|
|
|
# The FileDesc packet looks like: |
|
|
|
# 16 : "PAR 2.0\0FileDesc" |
|
|
|
# 16 : FileId |
|
|
|
# 16 : Hash for full file ** |
|
|
|
# 16 : Hash for first 16K |
|
|
|
# 8 : File length |
|
|
|
# xx : Name (multiple of 4, padded with \0 if needed) ** |
|
|
|
|
|
|
|
# See if it's the right packet and get name + hash |
|
|
|
# See if it's any of the packages we care about |
|
|
|
offset = 16 |
|
|
|
par2id = data[offset : offset + 16] |
|
|
|
|
|
|
|
if par2id == PAR_FILE_ID: |
|
|
|
par2_packet_type = data[offset : offset + 16] |
|
|
|
|
|
|
|
if par2_packet_type == PAR_FILE_ID: |
|
|
|
# The FileDesc packet looks like: |
|
|
|
# 16 : "PAR 2.0\0FileDesc" |
|
|
|
# 16 : FileId |
|
|
|
# 16 : Hash for full file |
|
|
|
# 16 : Hash for first 16K |
|
|
|
# 8 : File length |
|
|
|
# xx : Name (multiple of 4, padded with \0 if needed) |
|
|
|
filehash = data[offset + 32 : offset + 48] |
|
|
|
hash16k = data[offset + 48 : offset + 64] |
|
|
|
filename = correct_unknown_encoding(data[offset + 72 :].strip(b"\0")) |
|
|
|
return filename, filehash, hash16k |
|
|
|
elif par2id == PAR_CREATOR_ID: |
|
|
|
elif par2_packet_type == PAR_CREATOR_ID: |
|
|
|
# From here until the end is the creator-text |
|
|
|
# Useful in case of bugs in the par2-creating software |
|
|
|
par2creator = data[offset + 16 :].strip(b"\0") # Remove any trailing \0 |
|
|
|
logging.debug("Par2-creator of %s is: %s", os.path.basename(f.name), correct_unknown_encoding(par2creator)) |
|
|
|
|
|
|
|
return nothing |
|
|
|
elif par2_packet_type == PAR_MAIN_ID: |
|
|
|
# The Main packet looks like: |
|
|
|
# 16 : "PAR 2.0\0Main" |
|
|
|
# 8 : Slice size |
|
|
|
# 4 : Number of files in the recovery set |
|
|
|
nr_files = struct.unpack("<I", data[offset + 24 : offset + 28])[0] |
|
|
|
|
|
|
|
return filename, filehash, hash16k, nr_files |
|
|
|