Skip to content

Commit

Permalink
Merge pull request #23 from platelminto/dev
Browse files Browse the repository at this point in the history
- Encoder/group are now the same thing - encoder. What was often listed as encoder has now been corrected to website.

- Add genre, internationalCut fields.

- Improve various fields.
  • Loading branch information
platelminto authored Nov 27, 2020
2 parents 68d38e6 + 66c54fd commit cec34e2
Show file tree
Hide file tree
Showing 10 changed files with 796 additions and 371 deletions.
2 changes: 1 addition & 1 deletion PTN/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

__author__ = 'Giorgio Momigliano'
__email__ = 'gmomigliano@protonmail.com'
__version__ = '2.2'
__version__ = '2.3'
__license__ = 'MIT'

ptn = PTN()
Expand Down
43 changes: 39 additions & 4 deletions PTN/extras.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,43 @@
#!/usr/bin/env python

# Helper functions for patterns.py

# Helper functions and constants for patterns.py

delimiters = '[\.\s\-\+_\/(),]'

# (regex, display name) pairs, one per recognised language.
# The English entry has to stay last: its pattern matches a bare 'e'.
langs = [
    ('rus(?:sian)?', 'Russian'),
    ('(?:True)?fre?(?:nch)?', 'French'),
    ('(?:nu)?ita(?:liano?)?', 'Italian'),
    ('castellano|spa(?:nish)?|esp?', 'Spanish'),
    ('swedish', 'Swedish'),
    ('dk|dan(?:ish)?', 'Danish'),
    ('ger(?:man)?|deu(?:tsch)?', 'German'),
    ('nordic', 'Nordic'),
    ('exyu', 'ExYu'),
    ('chs|chi(?:nese)?', 'Chinese'),
    ('hin(?:di)?', 'Hindi'),
    ('polish', 'Polish'),
    ('mandarin', 'Mandarin'),
    ('kor(?:ean)?', 'Korean'),
    ('bengali|bangla', 'Bengali'),
    ('kannada', 'Kannada'),
    ('tam(?:il)?', 'Tamil'),
    ('tel(?:ugu)?', 'Telugu'),
    ('marathi', 'Marathi'),
    ('mal(?:ayalam)?', 'Malayalam'),
    ('japanese|ja?p', 'Japanese'),
    ('interslavic', 'Interslavic'),
    ('ara(?:bic)?', 'Arabic'),
    ('urdu', 'Urdu'),
    ('punjabi', 'Punjabi'),
    ('portuguese', 'Portuguese'),
    ('en?(?:g(?:lish)?)?', 'English'),  # Keep last: matches just an 'e'
]

# (regex, canonical name) pairs, one per recognised genre.
# Raw strings keep regex escapes such as '\.' from being read as (invalid)
# Python string escapes.
genres = [
    (r'Sci-?Fi', 'Sci-Fi'),
    (r'Drama', 'Drama'),
    (r'Comedy', 'Comedy'),
    (r'West(?:\.|ern)?', 'Western'),
    (r'Action', 'Action'),
]

# Some titles just can't be parsed without breaking everything else, so here
# are those known exceptions. They are executed when the parsed_title and
Expand Down Expand Up @@ -33,12 +69,11 @@
'proper': [], 'extended': []}


# (speakers, subwoofers) pairs iterated by get_channel_audio_options when it
# builds the audio patterns with appended channel types.
channels = [(1, 0), (2, 0), (5, 1), (6, 1), (7, 1)]


# Return tuple with regexes for audio name with appended channel types, and without any channels
def get_channel_audio_options(patterns_with_names):
from .patterns import delimiters
options = list()
for (audio_pattern, name) in patterns_with_names:
for (speakers, subwoofers) in channels:
Expand Down
40 changes: 27 additions & 13 deletions PTN/parse.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env python
from . import re
from .post import post_processing_before_excess, post_processing_after_excess
from .patterns import patterns, types, delimiters, langs, patterns_ordered
from .extras import exceptions, patterns_ignore_title, link_patterns
from .patterns import patterns, types, delimiters, patterns_ordered
from .extras import exceptions, patterns_ignore_title, link_patterns, langs, genres


class PTN(object):
Expand All @@ -29,11 +29,11 @@ def _part(self, name, match_slice, clean, overwrite=False):

@staticmethod
def _clean_string(string):
# Tidy a raw string: strip leading and trailing separator/bracket residue and
# replace '.' / '_' separators with spaces. Returns the cleaned string.
# NOTE(review): the result of this first assignment is overwritten by the
# next line before it is ever read - it looks like the pre-change line of the
# diff; confirm and drop it.
clean = re.sub(r'^ -', '', string)
# Remove a leading ' -', '(' or '['.
clean = re.sub(r'^( -|\(|\[)', '', string)
# Dots are only treated as separators when the text contains no spaces.
if clean.find(' ') == -1 and clean.find('.') != -1:
clean = re.sub(r'\.', ' ', clean)
clean = re.sub(r'_', ' ', clean)
# NOTE(review): two trailing-character strips run back to back with different
# character sets ('[', '(', '_' vs. '[', ')', '_', ']'); presumably only the
# second is meant to remain - confirm against the intended diff.
clean = re.sub(r'([\[(_]|- )$', '', clean).strip()
clean = re.sub(r'([\[)_\]]|- )$', '', clean).strip()
# Final pass: drop any remaining spaces, underscores or dashes at either end.
clean = clean.strip(' _-')

return clean
Expand All @@ -50,7 +50,7 @@ def parse(self, name, standardise):
pattern_options = self.normalise_pattern_options(pattern_options)

for (pattern, replace, transforms) in pattern_options:
if key not in ('season', 'episode', 'website', 'language'):
if key not in ('season', 'episode', 'website', 'language', 'genre'):
pattern = r'\b(?:{})\b'.format(pattern)

clean_name = re.sub(r'_', ' ', self.torrent_name)
Expand All @@ -74,12 +74,12 @@ def parse(self, name, standardise):

index = self.get_match_indexes(match)

if key == 'season' or key == 'episode':
if key in ('season', 'episode'):
clean = self.get_season_episode(match)
elif key == 'language':
clean = self.get_language(match)
elif key == 'subtitles':
clean = self.get_subtitles(match)
elif key in ('language', 'genre'):
clean = self.split_multi(match)
elif key in types.keys() and types[key] == 'boolean':
clean = True
else:
Expand All @@ -100,7 +100,9 @@ def parse(self, name, standardise):
unmatched = f(self, unmatched)

# clean_unmatched() depends on the before_excess methods adding more match slices.
self._part('excess', None, self.clean_unmatched())
cleaned_unmatched = self.clean_unmatched()
if cleaned_unmatched:
self._part('excess', None, cleaned_unmatched)

for f in post_processing_after_excess:
f(self)
Expand Down Expand Up @@ -198,8 +200,8 @@ def get_season_episode(match):
return clean

@staticmethod
def get_language(match):
# handle multi subtitles
def split_multi(match):
# handle multi languages
m = re.split(r'{}+'.format(delimiters), match[0])
clean = list(filter(None, m))

Expand All @@ -211,6 +213,7 @@ def get_subtitles(match):
m = re.split(r'{}+'.format(delimiters), match[0])
m = list(filter(None, m))
clean = list()
# If it's only 1 result, it's fine if it's just 'subs'.
if len(m) == 1:
clean = m
else:
Expand All @@ -232,6 +235,8 @@ def standardise_clean(self, clean, key, replace, transforms):
clean = self.standardise_languages(clean)
if not clean:
clean = 'Available'
if key == 'genre':
clean = self.standardise_genres(clean)
return clean

@staticmethod
Expand All @@ -246,6 +251,16 @@ def standardise_languages(clean):
clean = cleaned_langs
return clean

@staticmethod
def standardise_genres(clean):
    """Translate raw genre tokens into their canonical names.

    Each token in ``clean`` is tested against the ``genres`` table in order.
    The name paired with the first pattern that matches (case-insensitively,
    anchored at the start of the token) is kept; tokens matching no pattern
    are dropped.

    :param clean: iterable of raw genre strings
    :return: list of canonical genre names, in input order
    """
    standard_genres = []
    for raw_genre in clean:
        # Loop names are kept distinct from the ``clean`` parameter so the
        # input is never rebound while it is being iterated.
        for (pattern, canonical_name) in genres:
            if re.match(pattern, raw_genre, re.IGNORECASE):
                standard_genres.append(canonical_name)
                break
    return standard_genres

# Merge all the match slices (such as when they overlap), then remove
# them from excess.
def merge_match_slices(self):
Expand Down Expand Up @@ -320,8 +335,7 @@ def get_unmatched(self):
for (start, end) in self.unmatched_list():
unmatched += self.torrent_name[start:end]

clean = re.sub(r'(^[-. ()]+)|([-. ]+$)', '', unmatched)
return re.sub(r'[()/]', ' ', clean)
return unmatched

def clean_unmatched(self):
unmatched = list()
Expand Down
72 changes: 26 additions & 46 deletions PTN/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,16 @@

from .extras import *

delimiters = '[\.\s\-\+_\/()]'
langs = [('rus(?:sian)?', 'Russian'),
('(?:True)?fre?(?:nch)?', 'French'),
('(?:nu)?ita(?:liano?)?', 'Italian'),
('castellano|spa(?:nish)?|es', 'Spanish'),
('swedish', 'Swedish'),
('dk|dan(?:ish)?', 'Danish'),
('ger(?:man)?', 'German'),
('nordic', 'Nordic'),
('exyu', 'ExYu'),
('chs|chi(?:nese)?', 'Chinese'),
('hin(?:di)?', 'Hindi'),
('polish', 'Polish'),
('mandarin', 'Mandarin'),
('kor(?:ean)?', 'Korean'),
('bengali|bangla', 'Bengali'),
('kannada', 'Kannada'),
('tam(?:il)?', 'Tamil'),
('tel(?:ugu)?', 'Telugu'),
('marathi', 'Marathi'),
('mal(?:ayalam)?', 'Malayalam'),
('japanese|ja?p', 'Japanese'),
('interslavic', 'Interslavic'),
('ara(?:bic)?', 'Arabic'),
('urdu', 'Urdu'),
('punjabi', 'Punjabi'),
('portuguese', 'Portuguese'),
('en?(?:g(?:lish)?)?', 'English') # Must be at end, matches just an 'e'
]

# Matches a season range such as "s01-s04" or "seasons 1 to 3", optionally
# preceded or followed by "Complete".
# The fragment containing '\s' and '\-' is a raw string so those regex escapes
# are not read as (invalid) Python string escapes; the value is unchanged.
season_range_pattern = (
    '(?:Complete' + delimiters + '*)?' + delimiters + '*(?:s(?:easons?)?)' + delimiters
    + r'*(?:s?[0-9]{1,2}[\s]*(?:(?:\-|(?:\s*to\s*))[\s]*s?[0-9]{1,2})+)(?:' + delimiters + '*Complete)?'
)

year_pattern = '(?:19[0-9]|20[0-2])[0-9]'
month_pattern = '0[1-9]|1[0-2]'
day_pattern = '[0-2][0-9]|3[01]'

episode_name_pattern = '((?:[Pp](?:ar)?t' + delimiters + '*[0-9]|[A-Za-z][a-z]*(?:' + delimiters + \
episode_name_pattern = '((?:[Pp](?:ar)?t' + delimiters + '*[0-9]|(?:[A-Za-z]|[0-9])[a-z]*(?:' + delimiters + \
'|$))+)'
pre_website_encoder_pattern = '[^\s\.\[\]\-\(\)]+\)\s{0,2}\[[^\s\-]+\]|[^\s\.\[\]\-\(\)]+\s{0,2}(?:-\s)?[^\s\.\[\]\-]+$'

# Forces an order to go by the regexes, as we want this to be deterministic (different
# orders can generate different matchings). e.g. "doctor_who_2005..." in input.json
Expand All @@ -57,13 +28,14 @@
'container', 'widescreen', 'website', 'documentary', 'language', 'subtitles',
'sbs', 'unrated', 'size', 'bitDepth', '3d', 'internal', 'readnfo', 'network',
'fps', 'hdr', 'limited', 'remastered', 'directorsCut', 'upscaled', 'untouched',
'remux']
'remux', 'internationalCut', 'genre']

patterns = dict()
patterns['episode'] = ['(?<![a-z])(?:e|ep)(?:[0-9]{1,2}(?:-(?:e|ep)?(?:[0-9]{1,2}))?)(?![0-9])',
# Very specific as it could match too liberally
'\s\-\s\d{1,3}\s',
r'\b[0-9]{1,2}x([0-9]{2})\b'
r'\b[0-9]{1,2}x([0-9]{2})\b',
r'\bepisod(?:e|io)' + delimiters + r'\d{1,2}\b',
]
patterns['season'] = ['\ss?(\d{1,2})\s\-\s\d{1,2}\s', # Avoids matching some anime releases season and episode as a season range
r'\b' + season_range_pattern + r'\b', # Describes season ranges
Expand All @@ -76,12 +48,12 @@
]
# The first 4 season regexes won't have 'Part' in them.
patterns['episode'] += [link_patterns(patterns['season'][4:]) + delimiters + '*P(?:ar)?t' + delimiters + '*(\d{1,3})']
patterns['year'] = '((' + year_pattern + '))'
patterns['year'] = year_pattern
patterns['month'] = '(?:{year}){d}({month}){d}(?:{day})' \
.format(d=delimiters, year=year_pattern, month=month_pattern, day=day_pattern)
patterns['day'] = '(?:{year}){d}(?:{month}){d}({day})' \
.format(d=delimiters, year=year_pattern, month=month_pattern, day=day_pattern)
patterns['resolution'] = [('([0-9]{3,4}p)', None, 'lower'),
patterns['resolution'] = [('([0-9]{3,4}(?:p|i))', None, 'lower'),
('(1280x720p?)', '720p'),
('FHD', '1080p'),
('UHD', 'UHD'),
Expand All @@ -101,6 +73,7 @@
('DVDR|DVD-Full|Full-rip', 'DVD-R'),
('PDTV|DVBRip', 'PDTV'),
('DSR(?:ip)?|SATRip|DTHRip', 'DSRip'),
('AHDTV(?:Mux)?', 'AHDTV'),
('HDTV(?:Rip)?', 'HDTV'),
('D?TVRip|DVBRip', 'TVRip'),
('VODR(?:ip)?', 'VODRip'),
Expand All @@ -120,19 +93,22 @@
('HULU', 'Hulu Networks'),
('MS?NBC', 'MSNBC'),
('DCU', 'DC Universe'),
('ID', 'Investigation Discovery'),
]
patterns['network'] = suffix_pattern_with(link_patterns(patterns['quality']),
patterns['network'], delimiters)
# Not all networks always show up just before the quality.
# Not all networks always show up just before the quality, so if they're unlikely to clash,
# they should be added here.
patterns['network'] += [('BBC', 'BBC'),
('Hoichoi', 'Hoichoi'),
('Zee5', 'ZEE5'),
('Hallmark', 'Hallmark')]
('Hallmark', 'Hallmark'),
('Sony\s?LIV', 'SONY LIV')]
patterns['codec'] = [('xvid', 'Xvid'),
('av1', 'AV1'),
('[hx]\.?264', 'H.264'),
('AVC', 'H.264'),
('[hx]\.?265', 'H.265'),
('[hx]\.?265', 'H.265'), # Separate so if both are present, it won't pollute excess.
('HEVC', 'H.265'),
('[h]\.?263', 'H.263')]
patterns['audio'] = get_channel_audio_options([
Expand All @@ -145,15 +121,15 @@
('DTS', 'DTS'),
('AAC[ \.\-]LC', 'AAC-LC'),
('AAC', 'AAC'),
('Dual[\- ]Audio', 'Dual')
]) + [('5.1(?:{d}?ch(?:annel)?(?:{d}?Audio)?)?'.format(d=delimiters), '5.1'),
('2.0(?:{d}?ch(?:annel)?(?:{d}?Audio)?)?|2CH'.format(d=delimiters), 'Dual'),
('7.1(?:{d}?ch(?:annel)?(?:{d}?Audio)?)?'.format(d=delimiters), '7.1'),
('1{d}?Ch(?:annel)?(?:{d}?Audio)?'.format(d=delimiters), 'Mono'),
('Dual{d}Audios?'.format(d=delimiters), 'Dual')
]) + [('7.1(?:{d}?ch(?:annel)?(?:{d}?Audio)?)?'.format(d=delimiters), '7.1'),
('FLAC', 'FLAC'),
('5.1(?:{d}?ch(?:annel)?(?:{d}?Audio)?)?'.format(d=delimiters), '5.1'),
('MP3', None, 'upper'),
('2.0(?:{d}?ch(?:annel)?(?:{d}?Audio)?)?|2CH|stereo'.format(d=delimiters), 'Dual'),
('1{d}?Ch(?:annel)?(?:{d}?Audio)?'.format(d=delimiters), 'Mono'),
('(?:Original|Org)' + delimiters + 'Aud(?:io)?', 'Original'),
('LiNE', 'LiNE'),
('(?:Original|Org)' + delimiters + 'Aud(?:io)?', 'Original')
]
patterns['region'] = ('R[0-9]', None, 'upper')
patterns['extended'] = '(EXTENDED(:?.CUT)?)'
Expand All @@ -167,7 +143,7 @@
patterns['website'] = '^(\[ ?([^\]]+?) ?\])'

lang_list_pattern = r'\b(?:' + link_patterns(langs) + '(?:' + \
delimiters + '+' + link_patterns(patterns['audio']) + ')?' + \
delimiters + '+(?:dub(?:bed)?|' + link_patterns(patterns['audio']) + '))?' + \
'(?:' + delimiters + r'+|\b))'
subs_list_pattern = r'\b(?:' + link_patterns(langs) + delimiters + '*)'

Expand All @@ -194,14 +170,17 @@
patterns['3d'] = '3D'
patterns['internal'] = 'iNTERNAL'
patterns['readnfo'] = 'READNFO'
patterns['hdr'] = 'HDR'
patterns['hdr'] = 'HDR(?:10)?'
patterns['documentary'] = 'DOCU(?:menta?ry)?'
patterns['limited'] = 'LIMITED'
patterns['remastered'] = 'REMASTERED'
patterns['directorsCut'] = 'DC'
patterns['upscaled'] = '(?:AI{d}*)?upscaled?'.format(d=delimiters)
patterns['untouched'] = 'untouched'
patterns['remux'] = 'REMUX'
patterns['internationalCut'] = 'International{d}Cut'.format(d=delimiters)
# Spaces are only allowed before the genre list if after a word boundary or punctuation
patterns['genre'] = r'\b\s*[\(\-\]]+\s*((?:' + link_patterns(genres) + delimiters + r'?)+)\b'

types = {
'season': 'integer',
Expand All @@ -228,4 +207,5 @@
'upscaled': 'boolean',
'untouched': 'boolean',
'remux': 'boolean',
'internationalCut': 'boolean',
}
Loading

0 comments on commit cec34e2

Please sign in to comment.