You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
205 lines
6.3 KiB
205 lines
6.3 KiB
from flask.helpers import json
|
|
from hachoir_core.cmd_line import unicodeFilename
|
|
from hachoir_metadata import extractMetadata
|
|
from hachoir_parser import createParser
|
|
import datetime
|
|
import re
|
|
import sys
|
|
|
|
|
|
def getMetadata(filename):
    '''
    Extract metadata for a file and return it in hachoir's plaintext form.

    filename is converted with unicodeFilename() before being handed to
    createParser(); the value as originally given is passed along as the
    real name.

    Returns the result of metadata.exportPlaintext(), or None when no parser
    is available, when extraction raises, or when no metadata is found.
    '''
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    if parser is None:
        # No parser, nothing to extract. Handle it explicitly instead of
        # relying on the extraction step to cope with a None parser.
        return None

    try:
        metadata = extractMetadata(parser)
    except Exception:
        # Extraction is best effort, but KeyboardInterrupt and SystemExit
        # must still propagate, hence no bare "except:".
        return None

    if metadata is None:
        return None
    return metadata.exportPlaintext()
|
|
|
|
def parseMetadata(meta, jsonsafe=True):
    '''
    Group plaintext metadata lines into a dict of sections.

    meta is an iterable of text lines. A line that does not start with "- "
    is a section heading; a line that does is a "- key: value" entry that
    belongs to the most recent heading. Headings and keys are lower-cased,
    ":" is stripped from both ends of a heading, and a trailing stream number
    ("#1", "#2", ...) is dropped so that all streams of one type share a key.

    Each section maps to a list of dicts, one dict per occurrence of the
    heading, which supports multiple video/audio/subtitle/whatever streams
    per stream type. Each value is what _parseValue() returns for it, or the
    raw text when _parseValue() returns None.

    If jsonsafe is true, a value that json.dumps() rejects with TypeError is
    replaced by str(value).

    Returns None when meta is empty or None.

    An example input:

        Common:
        - Duration: 1 hour 40 min 6 sec
        - MIME type: video/x-matroska
        Video stream #1:
        - Image width: 1280 pixels
        - Compression: V_MPEG4/ISO/AVC
        Audio stream #1:
        - Channel: 6
        - Sample rate: 48.0 kHz

    and the corresponding output (jsonsafe=True):

        {'common': [{'duration': '1:40:06',
                     'mime type': 'video/x-matroska'}],
         'video stream': [{'image width': 1280,
                           'compression': 'AVC'}],
         'audio stream': [{'channel': 6,
                           'sample rate': 48000.0}]}
    '''
    if not meta:
        return None

    sections = {}
    for line in meta:
        if line[:2] != "- ":
            # Section heading.
            section = line.strip(":").lower()

            # Collapse "video stream #1", "video stream #2", ... into one key.
            # Only the trailing number is cut off; the rest of the heading is
            # left alone even if it happens to contain the same text.
            numbered = re.search(r'#\d+\Z', section)
            if numbered:
                section = section[:numbered.start()].strip()

            # One fresh dict per heading occurrence.
            sections.setdefault(section, []).append({})
            continue

        # Not a heading, so the entry goes into the last heading we found.
        # meta is expected to start with a heading, so 'section' is defined.
        key, colon, rest = line[2:].partition(":")
        key = key.lower()
        # Skip the single space that follows the colon; an entry without any
        # colon has no value text at all.
        raw = rest[1:] if colon else ""

        value = _parseValue(section, key, raw)
        if value is None:
            value = raw

        if jsonsafe:
            try:
                json.dumps(value)
            except TypeError:
                value = str(value)

        sections[section][-1][key] = value

    return sections
|
|
|
|
def _parseValue(section, key, value, jsonsafe=True):
    '''
    Tediously check all the types that we know about (checked over 7k videos
    to find these) and convert them to python native types.

    key is the lower-cased entry name and value its raw text. Returns:

      'bit rate'                      -> result of _parseBitRate()
      'bits/sample', 'bits/pixel'     -> int (first whitespace-separated token)
      'channel'                       -> 2 for 'stereo', 1 for 'mono', else int
      'compression'                   -> result of _parseCompression()
      'compression rate'              -> float (text before the first 'x')
      'duration'                      -> result of _parseDuration()
      'sample rate'                   -> float, first token multiplied by 1000
      'frame rate'                    -> float (first token)
      'image height', 'image width'   -> int from "<1-4 digits> pixel..."
      any other key whose value contains "YYYY-MM-DD HH:MM:SS"
                                      -> datetime.datetime

    None is returned for anything that is not recognised or fails to convert.

    section and jsonsafe are accepted but not consulted here; the conversion
    of json-unfriendly values is done by parseMetadata().
    '''
    if key == 'bit rate':
        return _parseBitRate(value.lower())

    if key == 'bits/sample' or key == 'bits/pixel':
        try:
            return int(value.split()[0])
        except (ValueError, IndexError):
            return None

    if key == 'channel':
        if value == 'stereo':
            return 2
        if value == 'mono':
            return 1
        try:
            return int(value)
        except ValueError:
            return None

    if key == 'compression':
        return _parseCompression(value)

    if key == 'compression rate':
        try:
            return float(value.split('x')[0])
        except ValueError:
            return None

    if key == 'duration':
        try:
            return _parseDuration(value)
        except Exception:
            return None

    if key == 'sample rate':
        # NOTE(review): the number is always scaled by 1000, i.e. the text is
        # assumed to be in kHz ("48.0 kHz"); confirm no other unit can occur.
        try:
            return float(value.split()[0]) * 1000
        except (ValueError, IndexError):
            return None

    if key == 'frame rate':
        # A failed conversion used to leave the result unassigned and blew up
        # with UnboundLocalError on return; report None like every other key.
        try:
            return float(value.split()[0])
        except (ValueError, IndexError):
            return None

    if key == 'image height' or key == 'image width':
        pixels = re.match(r"(?P<pixels>\d{1,4}) pixel", value)
        if pixels:
            return int(pixels.group('pixels'))
        return None

    # Unknown key: still pick up anything that embeds a timestamp.
    date_search = re.search(r"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d", value)
    if date_search:
        try:
            return datetime.datetime.strptime(date_search.group(), "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Looks like a timestamp but is not a valid date (e.g. month 13).
            return None

    # If it's something we don't know about...
    return None
|
|
|
|
def _parseDuration(value):
    '''
    Convert duration text such as "1 hour 40 min 6 sec" or "3 sec 500 ms"
    into a datetime.timedelta.

    The recognised components are "<n> hour(s)", "<n> min", "<n> sec" and
    "<n> ms", in that order, each optional, starting where the pattern first
    matches. Returns None when none of the components is present.
    '''
    t = re.search(r"((?P<hour>\d+) hour(s|))? ?((?P<min>\d+) min)? ?((?P<sec>\d+) sec)? ?((?P<ms>\d+) ms)?", value)
    if t is None:
        return None

    parts = t.groupdict()
    # Every component in the pattern is optional, so the search also succeeds
    # (with an empty match) on text that holds no duration at all. Treat that
    # as "not a duration" instead of reporting a zero-length one.
    if all(part is None for part in parts.values()):
        return None

    return datetime.timedelta(
        hours=int(parts['hour'] or 0),
        minutes=int(parts['min'] or 0),
        seconds=int(parts['sec'] or 0),
        milliseconds=int(parts['ms'] or 0),
    )
|
|
|
|
# (substring to look for in the lower-cased compression text, short codec name)
# Kept as an ordered tuple so that, when several substrings occur in the same
# text, the first entry listed here always wins.
_COMPRESSION_CODECS = (
    ('v_mpeg4/iso/avc', 'AVC'),
    ('x264', 'AVC'),
    ('divx', 'divx'),
    ('xvid', 'xvid'),
    ('v_ms/vfw/fourcc', 'vfw'),
    ('vorbis', 'vorbis'),
    ('mpeg layer 3', 'mp3'),
    ('a_dts', 'DTS'),
    ('a_aac', 'AAC'),
    ('a_truehd', 'TRUEHD'),
    ('microsoft mpeg', 'MPEG'),
    ('ac3', 'AC3'),
    ('wvc1', 'WVC1'),
    ('pulse code modulation', 'PCM'),
    ('pcm', 'PCM'),
    ('windows media audio', 'WMA'),
    ('windows media video', 'WMV'),
    ('s_text/ascii', 'ASCII'),
    ('s_text/utf8', 'UTF8'),
    ('s_text/ssa', 'SSA'),
    ('s_text/ass', 'ASS'),
)


def _parseCompression(value):
    '''
    Map a compression description to a short codec name.

    The match is a case-insensitive substring test against the known codec
    markers, tried in the order they are listed. Returns the short name of
    the first marker found in value, or None when none of them occurs.
    '''
    lowered = value.lower()
    for marker, name in _COMPRESSION_CODECS:
        if marker in lowered:
            return name
    return None
|
|
|
|
|
|
def _parseBitRate(value):
    '''
    Convert bit-rate text such as "128.0 kbit/sec" into a number of bits.

    The first whitespace-separated token is read as a float and scaled by
    1000 when the text contains "kbit" or by 1000 * 1000 when it contains
    "mbit" (case-insensitive). Returns None when the number cannot be read
    or neither unit is present.
    '''
    try:
        bitrate = float(value.split()[0])
    except (AttributeError, IndexError, ValueError):
        # Not text, empty text, or a first token that is not a number.
        return None

    lowered = value.lower()
    if 'kbit' in lowered:
        multi = 1000
    elif 'mbit' in lowered:
        multi = 1000 * 1000
    else:
        return None

    return bitrate * multi
|
|
|
|
if __name__ == '__main__':
    # Script entry point: print the parsed metadata of the file named by the
    # first command-line argument as JSON ("null" when nothing was parsed).
    # Guarded so that importing this module does not run it, and written with
    # print(...) so the line is valid under both Python 2 and Python 3.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <file>" % sys.argv[0])
    print(json.dumps(parseMetadata(getMetadata(sys.argv[1]))))
|