11 changed files with 568 additions and 118 deletions
@ -0,0 +1,115 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
from logr import Logr |
|||
from caper import FragmentMatcher |
|||
from caper.parsers.base import Parser |
|||
|
|||
|
|||
PATTERN_GROUPS = [ |
|||
('usenet', [ |
|||
r'\[(?P<group>#[\w\.@]+)\]', |
|||
r'^\[(?P<code>\w+)\]$', |
|||
r'\[(?P<full>FULL)\]', |
|||
r'\[\s?(?P<group>TOWN)\s?\]', |
|||
r'(.*?\s)?[_\W]*(?P<site>www\..*?\.[a-z0-9]+)[_\W]*(.*?\s)?', |
|||
r'(.*?\s)?[_\W]*(?P<site>(www\.)?[-\w]+\.(com|org|info))[_\W]*(.*?\s)?' |
|||
]), |
|||
|
|||
('part', [ |
|||
r'.?(?P<current>\d+)/(?P<total>\d+).?' |
|||
]), |
|||
|
|||
('detail', [ |
|||
r'[\s-]*\w*?[\s-]*\"(?P<file_name>.*?)\"[\s-]*\w*?[\s-]*(?P<size>[\d,\.]*\s?MB)?[\s-]*(?P<extra>yEnc)?', |
|||
r'(?P<size>[\d,\.]*\s?MB)[\s-]*(?P<extra>yEnc)', |
|||
r'(?P<size>[\d,\.]*\s?MB)|(?P<extra>yEnc)' |
|||
]) |
|||
] |
|||
|
|||
|
|||
class UsenetParser(Parser): |
|||
matcher = None |
|||
|
|||
def __init__(self, debug=False): |
|||
if not UsenetParser.matcher: |
|||
UsenetParser.matcher = FragmentMatcher(PATTERN_GROUPS) |
|||
Logr.info("Fragment matcher for %s created", self.__class__.__name__) |
|||
|
|||
super(UsenetParser, self).__init__(UsenetParser.matcher, debug) |
|||
|
|||
def run(self, closures): |
|||
""" |
|||
:type closures: list of CaperClosure |
|||
""" |
|||
|
|||
self.setup(closures) |
|||
|
|||
# Capture usenet or part info until we get a part or matching fails |
|||
self.capture_closure('usenet', regex='usenet', single=False)\ |
|||
.capture_closure('part', regex='part', single=True) \ |
|||
.until_result(tag='part') \ |
|||
.until_failure()\ |
|||
.execute() |
|||
|
|||
is_town_release, has_part = self.get_state() |
|||
|
|||
if not is_town_release: |
|||
self.capture_release_name() |
|||
|
|||
# If we already have the part (TOWN releases), ignore matching part again |
|||
if not is_town_release and not has_part: |
|||
self.capture_fragment('part', regex='part', single=True)\ |
|||
.until_closure(node__re='usenet')\ |
|||
.until_success()\ |
|||
.execute() |
|||
|
|||
# Capture any leftover details |
|||
self.capture_closure('usenet', regex='usenet', single=False)\ |
|||
.capture_closure('detail', regex='detail', single=False)\ |
|||
.execute() |
|||
|
|||
self.result.build() |
|||
return self.result |
|||
|
|||
def capture_release_name(self): |
|||
self.capture_closure('detail', regex='detail', single=False)\ |
|||
.until_failure()\ |
|||
.execute() |
|||
|
|||
self.capture_fragment('release_name', single=False, include_separators=True) \ |
|||
.until_closure(node__re='usenet') \ |
|||
.until_closure(node__re='detail') \ |
|||
.until_closure(node__re='part') \ |
|||
.until_fragment(value__eq='-')\ |
|||
.execute() |
|||
|
|||
# Capture any detail after the release name |
|||
self.capture_closure('detail', regex='detail', single=False)\ |
|||
.until_failure()\ |
|||
.execute() |
|||
|
|||
def get_state(self): |
|||
# TODO multiple-chains? |
|||
is_town_release = False |
|||
has_part = False |
|||
|
|||
for tag, result in self.result.heads[0].captured(): |
|||
if tag == 'usenet' and result.get('group') == 'TOWN': |
|||
is_town_release = True |
|||
|
|||
if tag == 'part': |
|||
has_part = True |
|||
|
|||
return is_town_release, has_part |
Loading…
Reference in new issue