Browse Source
* correct_extension: basics, including unittest * correct_extension: basics, including unittest * correct_extension: puremagic into requirements.txt * correct_extension: introduce a main for testing from CLI * correct_extension: parse all parameters on CLI as files * correct_extension: parse all parameters on CLI as files * correct_extension: CLI parameter "-p" for privacy output * correct_extension: has_common_extension() and most_likely_extension() * correct_extension: has_common_extension() and most_likely_extension() * correct_extension: add extension if file has no commonly used extension * correct_extension: Black happy ... hopefully * correct_extension: Black happy ... hopefully * correct_extension: process feedback, mainly the extensions lists ^H^H^H^ tuples * correct_extension: process feedback, mainly the extensions lists ^H^H^H^ tuples * correct_extension: process feedback, mainly the extensions lists ^H^H^H^ tuples * correct_extension: process feedback, mainly the extensions lists ^H^H^H^ tuples * correct_extension: cleaned up * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: easier if-then-logic, check if new_extension_to_add is filled. * correct_extension: if puremagic does not recognize txt or nzb, check ourselves * correct_extension: if puremagic does not recognize txt or nzb, check ourselves * correct_extension: only files! * correct_extension: only files! * correct_extension: rNN files not common extension, plus easier testing * correct_extension: clean-up ... 
no more boolean extension_too * correct_extension: requirements.txt, solved a TODO, and use get_ext() * correct_extension: a comment added * correct_extension: correct typing, correct txt and nzb extension * correct_extension: extensions always with dots, bug fix in what_is_most_likely_extension() * correct_extension: back on track? * correct_extension: back on track? * correct_extension: better comments pull/1920/head
committed by
GitHub
12 changed files with 567 additions and 82 deletions
@ -0,0 +1,314 @@ |
|||||
|
#!/usr/bin/python3 |
||||
|
|
||||
|
""" function to check and find correct extension of a (deobfuscated) file |
||||
|
Note: extension always contains a leading dot |
||||
|
""" |
||||
|
|
||||
|
|
||||
|
import puremagic |
||||
|
import os |
||||
|
import sys |
||||
|
from typing import List |
||||
|
from pathlib import Path |
||||
|
from sabnzbd.filesystem import get_ext |
||||
|
|
||||
|
# Extensions in common everyday use, taken from
# https://www.computerhope.com/issues/ch001789.htm
# Stored without a leading dot; every extension is listed exactly once.
POPULAR_EXT = (
    "3g2",
    "3gp",
    "7z",
    "ai",
    "aif",
    "apk",
    "arj",
    "asp",
    "aspx",
    "avi",
    "bak",
    "bat",
    "bin",
    "bmp",
    "c",
    "cab",
    "cda",
    "cer",
    "cfg",
    "cfm",
    "cgi",
    "class",
    "com",
    "cpl",
    "cpp",
    "cs",
    "css",
    "csv",
    "cur",
    "dat",
    "db",
    "dbf",
    "deb",
    "dll",
    "dmg",
    "dmp",
    "doc",
    "docx",
    "drv",
    "email",
    "eml",
    "emlx",
    "exe",
    "flv",
    "fnt",
    "fon",
    "gadget",
    "gif",
    "h",
    "h264",
    "htm",
    "html",
    "icns",
    "ico",
    "ini",
    "iso",
    "jar",
    "java",
    "jpeg",
    "jpg",
    "js",
    "jsp",
    "key",
    "lnk",
    "log",
    "m4v",
    "mdb",
    "mid",
    "midi",
    "mkv",
    "mov",
    "mp3",
    "mp4",
    "mpa",
    "mpeg",
    "mpg",
    "msg",
    "msi",
    "odp",
    "ods",
    "odt",
    "oft",
    "ogg",
    "ost",
    "otf",
    "part",
    "pdf",
    "php",
    "pkg",
    "pl",
    "png",
    "pps",
    "ppt",
    "pptx",
    "ps",
    "psd",
    "pst",
    "py",
    "rar",
    "rm",
    "rpm",
    "rss",
    "rtf",
    "sav",
    "sh",
    "sql",
    "svg",
    "swf",
    "swift",
    "sys",
    "tar",
    "gz",
    "tex",
    "tif",
    "tiff",
    "tmp",
    "toast",
    "ttf",
    "txt",
    "vb",
    "vcd",
    "vcf",
    "vob",
    "wav",
    "wma",
    "wmv",
    "wpd",
    "wpl",
    "wsf",
    "xhtml",
    "xls",
    "xlsm",
    "xlsx",
    "z",
    "zip",
)

# Second list of extensions (also without a leading dot); it overlaps with
# POPULAR_EXT in places, the overlap is removed when ALL_EXT is built.
DOWNLOAD_EXT = (
    "ass",
    "avi",
    "bat",
    "bdmv",
    "bin",
    "bup",
    "clpi",
    "crx",
    "db",
    "diz",
    "djvu",
    "docx",
    "epub",
    "exe",
    "flac",
    "gif",
    "gz",
    "htm",
    "html",
    "icns",
    "ico",
    "idx",
    "ifo",
    "img",
    "inf",
    "info",
    "ini",
    "iso",
    "jpg",
    "log",
    "m2ts",
    "m3u",
    "m4a",
    "mkv",
    "mp3",
    "mp4",
    "mpls",
    "mx",
    "nfo",
    "nib",
    "nzb",
    "otf",
    "par2",
    "part",
    "pdf",
    "pem",
    "php",
    "plist",
    "png",
    "py",
    "rar",
    "releaseinfo",
    "rev",
    "sfv",
    "sh",
    "srr",
    "srs",
    "srt",
    "strings",
    "sub",
    "sup",
    "sys",
    "tif",
    "ttf",
    "txt",
    "url",
    "vob",
    "website",
    "wmv",
    "xpi",
)

# Union of both tuples with unique entries, each prefixed with a dot because
# this module works with a leading dot in extensions. Sorted so that the tuple
# has the same order on every run instead of following set iteration order.
ALL_EXT = tuple(sorted({"." + ext for ext in POPULAR_EXT + DOWNLOAD_EXT}))
||||
|
|
||||
|
|
||||
|
def has_popular_extension(file_path: str) -> bool:
    """Tell whether the extension of file_path is one of the well-known ones in ALL_EXT"""
    # ALL_EXT entries carry a leading dot, so the value from get_ext() is compared as-is
    return get_ext(file_path) in ALL_EXT
||||
|
|
||||
|
|
||||
|
def all_possible_extensions(file_path: str) -> List[str]:
    """Collect the extension (with leading dot) of every match puremagic reports for file_path,
    in the order puremagic returns them"""
    return [match.extension for match in puremagic.magic_file(file_path)]
||||
|
|
||||
|
|
||||
|
def what_is_most_likely_extension(file_path: str) -> str:
    """Returns the most likely extension of file_path, with a leading dot

    Order of preference:
      1. the first extension suggested by puremagic that is also in ALL_EXT
      2. ".nzb" or ".txt", when the file can be read as text
      3. the first extension suggested by puremagic, popular or not
      4. "" when none of the above gives an answer
    """
    # Ask puremagic only once; the same answer serves the fallback at the end
    candidates = all_possible_extensions(file_path)

    for candidate in candidates:
        # let's see if the technically-suggested extension is also likely IRL
        if candidate in ALL_EXT:
            return candidate

    # Check if text or NZB, as puremagic is not good at that.
    # NOTE(review): read_text() reads and decodes the complete file with the
    # default encoding; for big files a bounded read may be preferable.
    try:
        txt = Path(file_path).read_text().lower()
    except UnicodeDecodeError:
        # not txt (and not nzb)
        pass
    else:
        # Yes, a text file ... so let's check if it even contains NZB signals
        if "<nzb xmlns=" in txt or "!doctype nzb public" in txt:
            return ".nzb"
        return ".txt"

    # no popular extension found, so just trust puremagic and return the first extension (if any)
    return candidates[0] if candidates else ""
||||
|
|
||||
|
|
||||
|
if __name__ == "__main__":
    # Manual test from the CLI: every parameter is a file to be ext-checked,
    # except "-p", which switches on privacy for all parameters after it
    privacy = False

    for file_path in sys.argv[1:]:
        if file_path == "-p":
            privacy = True
            continue

        # privacy, please ... so only print the last 10 chars of a file
        to_be_printed = file_path[-10:] if privacy else file_path

        if has_popular_extension(file_path):
            # a common extension, so let's see what puremagic says, so that we can learn
            file_extension = os.path.splitext(file_path)[1][1:].lower()

            print(
                to_be_printed,
                "IRL-ext",
                file_extension,
                "most_likely",
                what_is_most_likely_extension(file_path),
                "puremagic",
                all_possible_extensions(file_path),
            )
After Width: | Height: | Size: 8.9 KiB |
Binary file not shown.
@ -0,0 +1,84 @@ |
|||||
|
<?xml version="1.0" encoding="utf-8"?> |
||||
|
<!DOCTYPE nzb PUBLIC "-//newzBin//DTD NZB 1.1//EN" "http://www.newzbin.com/DTD/nzb/nzb-1.1.dtd"> |
||||
|
<nzb xmlns="http://www.newzbin.com/DTD/2003/nzb"> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601671" subject="mix no ext 38f38a34acc2 [02/10] - "inthemix.par2" yEnc (1/1) 39860"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="41232" number="1">QoEbWuJpTnYmReOxUbFmBvLx-1623601671928@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601671" subject="mix no ext 38f38a34acc2 [03/10] - "inthemix.vol000+001.par2" yEnc (1/1) 40196"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="41590" number="1">OfUzNpRoQlEkAkJwUoHxJlJj-1623601671929@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601671" subject="mix no ext 38f38a34acc2 [04/10] - "inthemix.vol001+002.par2" yEnc (1/1) 40532"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="41938" number="1">TsNlKcDyMiCiNeHrMhFrQwPu-1623601671929@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601672" subject="mix no ext 38f38a34acc2 [06/10] - "inthemix.vol007+008.par2" yEnc (1/1) 122036"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="125956" number="1">RvFtBzLeVzYhCiSjNkYqPkYv-1623601672004@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601672" subject="mix no ext 38f38a34acc2 [05/10] - "inthemix.vol003+004.par2" yEnc (1/1) 80948"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="83601" number="1">CyBcLhFsErVvWhKaJbKySsLh-1623601672003@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601671" subject="mix no ext 38f38a34acc2 [01/10] - "inthemix.rar" yEnc (1/1) 528471"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="544152" number="1">ZtFjLqEiBmQgZyHyRjIvLmDq-1623601671925@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601672" subject="mix no ext 38f38a34acc2 [07/10] - "inthemix.vol015+016.par2" yEnc (1/1) 164468"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="169681" number="1">ZbBmMqCmJyRgOjAiSgMmFhUs-1623601672012@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601672" subject="mix no ext 38f38a34acc2 [08/10] - "inthemix.vol031+032.par2" yEnc (1/1) 209588"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="216211" number="1">OmEhDrElGwEkYrHsTcFlYeYp-1623601672019@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601672" subject="mix no ext 38f38a34acc2 [09/10] - "inthemix.vol063+064.par2" yEnc (1/1) 260084"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="268274" number="1">SkUsGaAkBjNpHoCsLtLiBcYn-1623601672044@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
<file poster="blablamannetje <blabla@example.com>" date="1623601672" subject="mix no ext 38f38a34acc2 [10/10] - "inthemix.vol127+071.par2" yEnc (1/1) 262436"> |
||||
|
<groups> |
||||
|
<group>alt.binaries.test</group> |
||||
|
</groups> |
||||
|
<segments> |
||||
|
<segment bytes="270671" number="1">PfYdNqVpPpLvOqTvYrXoRbQi-1623601672045@nyuu</segment> |
||||
|
</segments> |
||||
|
</file> |
||||
|
</nzb> |
Binary file not shown.
@ -0,0 +1,4 @@ |
|||||
|
Yes, this is a text file. |
||||
|
|
||||
|
The END |
||||
|
|
@ -0,0 +1,54 @@ |
|||||
|
#!/usr/bin/python3 -OO |
||||
|
# Copyright 2007-2021 The SABnzbd-Team <team@sabnzbd.org> |
||||
|
# |
||||
|
# This program is free software; you can redistribute it and/or |
||||
|
# modify it under the terms of the GNU General Public License |
||||
|
# as published by the Free Software Foundation; either version 2 |
||||
|
# of the License, or (at your option) any later version. |
||||
|
# |
||||
|
# This program is distributed in the hope that it will be useful, |
||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
|
# GNU General Public License for more details. |
||||
|
# |
||||
|
# You should have received a copy of the GNU General Public License |
||||
|
# along with this program; if not, write to the Free Software |
||||
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
||||
|
|
||||
|
""" |
||||
|
Testing SABnzbd correct extension functionality module |
||||
|
""" |
||||
|
|
||||
|
import os |
||||
|
from tests.testhelper import * |
||||
|
import sabnzbd.utils.file_extension as file_extension |
||||
|
|
||||
|
|
||||
|
class Test_File_Extension:
    def test_has_popular_extension(self):
        # Only the extension matters here; these paths do not have to exist
        for popular in ("blabla/blabla.mkv", "blabla/blabla.srt", "djjddj/aaaaa.epub"):
            assert file_extension.has_popular_extension(popular)
        assert not file_extension.has_popular_extension("98ads098f098fa.a0ds98f098asdf")

    def test_what_is_most_likely_extension(self):
        # Real-content files without extension, where the contents determine the extension
        expected_per_file = (
            ("tests/data/test_file_extension/apeeengeee", ".png"),  # A PNG
            ("tests/data/test_file_extension/somepeedeef", ".pdf"),  # Some PDF
            ("tests/data/test_file_extension/my_matroska", ".mkv"),  # my Matroska MKV
            ("tests/data/test_file_extension/sometxtfile", ".txt"),  # a txt file
            ("tests/data/test_file_extension/some_nzb_file", ".nzb"),  # a NZB file
        )
        for filename, expected in expected_per_file:
            assert os.path.isfile(filename)
            assert file_extension.what_is_most_likely_extension(filename) == expected
Loading…
Reference in new issue