Browse Source
* correct_extension: basics, including unittest * correct_extension: basics, including unittest * correct_extension: puremagic into requirements.txt * correct_extension: introduce a main for testing from CLI * correct_extension: parse all parameters on CLI as files * correct_extension: parse all parameters on CLI as files * correct_extension: CLI parameter "-p" for privacy output * correct_extension: has_common_extension() and most_likely_extension() * correct_extension: has_common_extension() and most_likely_extension() * correct_extension: add extension if file has no commonly used extension * correct_extension: Black happy ... hopefully * correct_extension: Black happy ... hopefully * correct_extension: process feedback, mainly the extenions lists ^H^H^H^ tuples * correct_extension: process feedback, mainly the extenions lists ^H^H^H^ tuples * correct_extension: process feedback, mainly the extenions lists ^H^H^H^ tuples * correct_extension: process feedback, mainly the extenions lists ^H^H^H^ tuples * correct_extension: cleaned up * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: cleaned up ... github-black now happy? * correct_extension: easier if-then-logic, check if new_extension_to_add is filled. * correct_extension: if puremagic does recoging txt or nzb, check ourselves * correct_extension: if puremagic does recoging txt or nzb, check ourselves * correct_extension: only files! * correct_extension: only files! * correct_extension: rNN files not common extension, plus easier testing * correct_extension: clean-up ... 
no more boolean extension_too * correct_extension: requirements.txt, solved a TODO, and use get_ext() * correct_extension: a comment added * correct_extension: correct typing, correct txt and nzb extension * correct_extension: extensions always with dots, bug fix in what_is_most_likely_extension() * correct_extension: back on track? * correct_extension: back on track? * correct_extension: better commentspull/1920/head
committed by
GitHub
12 changed files with 567 additions and 82 deletions
@ -0,0 +1,314 @@ |
|||
#!/usr/bin/python3 |
|||
|
|||
""" function to check and find correct extension of a (deobfuscated) file |
|||
Note: extension always contains a leading dot |
|||
""" |
|||
|
|||
|
|||
import puremagic |
|||
import os |
|||
import sys |
|||
from typing import List |
|||
from pathlib import Path |
|||
from sabnzbd.filesystem import get_ext |
|||
|
|||
# common extension from https://www.computerhope.com/issues/ch001789.htm |
|||
# Every extension is listed exactly once and without a leading dot;
# the dot is prepended when ALL_EXT is built.
POPULAR_EXT = (
    "3g2",
    "3gp",
    "7z",
    "ai",
    "aif",
    "apk",
    "arj",
    "asp",
    "aspx",
    "avi",
    "bak",
    "bat",
    "bin",
    "bmp",
    "c",
    "cab",
    "cda",
    "cer",
    "cfg",
    "cfm",
    "cgi",
    "class",
    "com",
    "cpl",
    "cpp",
    "cs",
    "css",
    "csv",
    "cur",
    "dat",
    "db",
    "dbf",
    "deb",
    "dll",
    "dmg",
    "dmp",
    "doc",
    "docx",
    "drv",
    "email",
    "eml",
    "emlx",
    "exe",
    "flv",
    "fnt",
    "fon",
    "gadget",
    "gif",
    "h",
    "h264",
    "htm",
    "html",
    "icns",
    "ico",
    "ini",
    "iso",
    "jar",
    "java",
    "jpeg",
    "jpg",
    "js",
    "jsp",
    "key",
    "lnk",
    "log",
    "m4v",
    "mdb",
    "mid",
    "midi",
    "mkv",
    "mov",
    "mp3",
    "mp4",
    "mpa",
    "mpeg",
    "mpg",
    "msg",
    "msi",
    "odp",
    "ods",
    "odt",
    "oft",
    "ogg",
    "ost",
    "otf",
    "part",
    "pdf",
    "php",
    "pkg",
    "pl",
    "png",
    "pps",
    "ppt",
    "pptx",
    "ps",
    "psd",
    "pst",
    "py",
    "rar",
    "rm",
    "rpm",
    "rss",
    "rtf",
    "sav",
    "sh",
    "sql",
    "svg",
    "swf",
    "swift",
    "sys",
    "tar",
    "gz",
    "tex",
    "tif",
    "tiff",
    "tmp",
    "toast",
    "ttf",
    "txt",
    "vb",
    "vcd",
    "vcf",
    "vob",
    "wav",
    "wma",
    "wmv",
    "wpd",
    "wpl",
    "wsf",
    "xhtml",
    "xls",
    "xlsm",
    "xlsx",
    "z",
    "zip",
)
|||
|
|||
# Extensions are written without a leading dot (it is prepended when ALL_EXT is built).
# The names are kept in one whitespace-separated string that is split into a tuple.
DOWNLOAD_EXT = tuple(
    (
        "ass avi bat bdmv bin bup clpi crx db diz djvu docx epub exe flac gif gz htm html "
        "icns ico idx ifo img inf info ini iso jpg log m2ts m3u m4a mkv mp3 mp4 mpls mx "
        "nfo nib nzb otf par2 part pdf pem php plist png py rar releaseinfo rev sfv sh "
        "srr srs srt strings sub sup sys tif ttf txt url vob website wmv xpi"
    ).split()
)
|||
|
|||
# Merge both tuples, drop the entries they share, and give each extension its leading dot,
# because extensions are handled with a leading dot in this module
ALL_EXT = tuple("." + ext for ext in set(POPULAR_EXT + DOWNLOAD_EXT))
|||
|
|||
|
|||
def has_popular_extension(file_path: str) -> bool:
    """Tell whether the extension of file_path, as returned by get_ext(), is listed in ALL_EXT"""
    # NOTE(review): ALL_EXT entries carry a leading dot, so get_ext() is presumed to return
    # the extension including its dot -- confirm against sabnzbd.filesystem.get_ext
    return get_ext(file_path) in ALL_EXT
|||
|
|||
|
|||
def all_possible_extensions(file_path: str) -> List[str]:
    """Collect the extension (with leading dot) of every match puremagic reports for file_path.

    The order of the result is the order in which puremagic.magic_file() yields its matches.
    """
    return [match.extension for match in puremagic.magic_file(file_path)]
|||
|
|||
|
|||
def _text_file_contains_nzb_markers(file_path: str, chunk_size: int = 1024 * 1024) -> bool:
    """Decode file_path as text, chunk by chunk, and report whether it contains NZB markers.

    The file is opened with the platform's default text encoding and read to the end, so
    memory use stays bounded by chunk_size no matter how large the file is.

    Raises UnicodeDecodeError if any part of the file cannot be decoded, i.e. the file
    is not a text file.
    """
    markers = ("<nzb xmlns=", "!doctype nzb public")
    # Number of characters to carry over, so a marker split over two chunks is still found
    overlap = max(len(marker) for marker in markers) - 1

    found = False
    tail = ""
    with open(file_path, "r") as text_file:
        # Keep reading after a hit: the remainder must still decode for the file to be text
        for chunk in iter(lambda: text_file.read(chunk_size), ""):
            if not found:
                window = tail + chunk.lower()
                found = any(marker in window for marker in markers)
                tail = window[-overlap:]
    return found


def what_is_most_likely_extension(file_path: str) -> str:
    """Returns most_likely extension, with a leading dot

    Order of preference:
    1. the first extension suggested by puremagic that is also listed in ALL_EXT
    2. ".nzb" or ".txt" if the complete file can be decoded as text
    3. the first extension suggested by puremagic, whatever it is
    4. an empty string if puremagic has no suggestion at all
    """
    # Ask puremagic only once; the result is needed again for the fallback at the end
    candidates = all_possible_extensions(file_path)

    for possible_extension in candidates:
        # let's see if technically-suggested extension by puremagic is also likely IRL
        if possible_extension in ALL_EXT:
            # Yes, looks likely
            return possible_extension

    # Check if text or NZB, as puremagic is not good at that.
    try:
        if _text_file_contains_nzb_markers(file_path):
            # a text file that contains NZB signals
            return ".nzb"
        return ".txt"
    except UnicodeDecodeError:
        # not txt (and not nzb)
        pass

    # no popular extension found, so just trust puremagic and return the first extension (if any)
    return candidates[0] if candidates else ""
|||
|
|||
|
|||
if __name__ == "__main__":
    # Small CLI for manual testing: every argument is a file to be ext-checked,
    # except "-p", which switches on privacy mode for the files that follow it.
    privacy = False

    for argument in sys.argv[1:]:
        if argument == "-p":
            # privacy, please ... so only print last 10 chars of a file
            privacy = True
            continue

        file_path = argument
        to_be_printed = file_path[-10:] if privacy else file_path

        if has_popular_extension(file_path):
            # a common extension, so let's see what puremagic says, so that we can learn
            file_extension = os.path.splitext(file_path)[1][1:].lower()

            print(
                "IRL-ext",
                file_extension,
                "most_likely",
                what_is_most_likely_extension(file_path),
                "puremagic",
                all_possible_extensions(file_path),
                # The (possibly shortened) name comes last, so the existing fields keep their position
                "file",
                to_be_printed,
            )
After Width: | Height: | Size: 8.9 KiB |
Binary file not shown.
@ -0,0 +1,84 @@ |
|||
<?xml version="1.0" encoding="utf-8"?> |
|||
<!DOCTYPE nzb PUBLIC "-//newzBin//DTD NZB 1.1//EN" "http://www.newzbin.com/DTD/nzb/nzb-1.1.dtd"> |
|||
<nzb xmlns="http://www.newzbin.com/DTD/2003/nzb"> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601671" subject="mix no ext 38f38a34acc2 [02/10] - &quot;inthemix.par2&quot; yEnc (1/1) 39860"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="41232" number="1">QoEbWuJpTnYmReOxUbFmBvLx-1623601671928@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601671" subject="mix no ext 38f38a34acc2 [03/10] - &quot;inthemix.vol000+001.par2&quot; yEnc (1/1) 40196"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="41590" number="1">OfUzNpRoQlEkAkJwUoHxJlJj-1623601671929@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601671" subject="mix no ext 38f38a34acc2 [04/10] - &quot;inthemix.vol001+002.par2&quot; yEnc (1/1) 40532"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="41938" number="1">TsNlKcDyMiCiNeHrMhFrQwPu-1623601671929@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601672" subject="mix no ext 38f38a34acc2 [06/10] - &quot;inthemix.vol007+008.par2&quot; yEnc (1/1) 122036"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="125956" number="1">RvFtBzLeVzYhCiSjNkYqPkYv-1623601672004@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601672" subject="mix no ext 38f38a34acc2 [05/10] - &quot;inthemix.vol003+004.par2&quot; yEnc (1/1) 80948"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="83601" number="1">CyBcLhFsErVvWhKaJbKySsLh-1623601672003@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601671" subject="mix no ext 38f38a34acc2 [01/10] - &quot;inthemix.rar&quot; yEnc (1/1) 528471"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="544152" number="1">ZtFjLqEiBmQgZyHyRjIvLmDq-1623601671925@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601672" subject="mix no ext 38f38a34acc2 [07/10] - &quot;inthemix.vol015+016.par2&quot; yEnc (1/1) 164468"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="169681" number="1">ZbBmMqCmJyRgOjAiSgMmFhUs-1623601672012@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601672" subject="mix no ext 38f38a34acc2 [08/10] - &quot;inthemix.vol031+032.par2&quot; yEnc (1/1) 209588"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="216211" number="1">OmEhDrElGwEkYrHsTcFlYeYp-1623601672019@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601672" subject="mix no ext 38f38a34acc2 [09/10] - &quot;inthemix.vol063+064.par2&quot; yEnc (1/1) 260084"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="268274" number="1">SkUsGaAkBjNpHoCsLtLiBcYn-1623601672044@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
<file poster="blablamannetje &lt;blabla@example.com&gt;" date="1623601672" subject="mix no ext 38f38a34acc2 [10/10] - &quot;inthemix.vol127+071.par2&quot; yEnc (1/1) 262436"> |
|||
<groups> |
|||
<group>alt.binaries.test</group> |
|||
</groups> |
|||
<segments> |
|||
<segment bytes="270671" number="1">PfYdNqVpPpLvOqTvYrXoRbQi-1623601672045@nyuu</segment> |
|||
</segments> |
|||
</file> |
|||
</nzb> |
Binary file not shown.
@ -0,0 +1,4 @@ |
|||
Yes, this is a text file. |
|||
|
|||
The END |
|||
|
@ -0,0 +1,54 @@ |
|||
#!/usr/bin/python3 -OO |
|||
# Copyright 2007-2021 The SABnzbd-Team <team@sabnzbd.org> |
|||
# |
|||
# This program is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU General Public License |
|||
# as published by the Free Software Foundation; either version 2 |
|||
# of the License, or (at your option) any later version. |
|||
# |
|||
# This program is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
# GNU General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU General Public License |
|||
# along with this program; if not, write to the Free Software |
|||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
|||
|
|||
""" |
|||
Testing SABnzbd correct extension functionality module |
|||
""" |
|||
|
|||
import os |
|||
from tests.testhelper import * |
|||
import sabnzbd.utils.file_extension as file_extension |
|||
|
|||
|
|||
class Test_File_Extension:
    """Tests for the functions in sabnzbd.utils.file_extension"""

    def test_has_popular_extension(self):
        """Paths with a well-known extension are accepted, a random extension is rejected"""
        for popular_path in ("blabla/blabla.mkv", "blabla/blabla.srt", "djjddj/aaaaa.epub"):
            assert file_extension.has_popular_extension(popular_path)
        assert not file_extension.has_popular_extension("98ads098f098fa.a0ds98f098asdf")

    def test_what_is_most_likely_extension(self):
        """Fixture files without an extension must be recognised by their contents"""
        # These are real-content files, where the contents determine the extension
        expectations = (
            ("tests/data/test_file_extension/apeeengeee", ".png"),  # A PNG
            ("tests/data/test_file_extension/somepeedeef", ".pdf"),  # Some PDF
            ("tests/data/test_file_extension/my_matroska", ".mkv"),  # my Matroska MKV
            ("tests/data/test_file_extension/sometxtfile", ".txt"),  # a txt file
            ("tests/data/test_file_extension/some_nzb_file", ".nzb"),  # a NZB file
        )
        for filename, expected_extension in expectations:
            # Fail early with a clear assertion if the fixture itself is missing
            assert os.path.isfile(filename)
            assert file_extension.what_is_most_likely_extension(filename) == expected_extension
Loading…
Reference in new issue