From f73a033045c68c3fa03f2d3bdb1fbe5171cb783c Mon Sep 17 00:00:00 2001 From: Andrei Pradan Date: Wed, 27 Jul 2016 17:29:38 +0300 Subject: [PATCH] Python3 compatibility tweaks --- docs/conf.py | 16 +- pycaption/base.py | 58 +- pycaption/dfxp/__init__.py | 2 - pycaption/dfxp/base.py | 312 ++-- pycaption/dfxp/extras.py | 117 +- pycaption/geometry.py | 135 +- pycaption/sami.py | 292 ++-- pycaption/scc/__init__.py | 105 +- pycaption/scc/constants.py | 1780 +++++++++++----------- pycaption/scc/specialized_collections.py | 41 +- pycaption/scc/state_machines.py | 2 +- pycaption/srt.py | 50 +- pycaption/transcript.py | 10 +- pycaption/webvtt.py | 157 +- tests/mixins.py | 32 +- tests/samples/dfxp.py | 11 +- tests/samples/sami.py | 36 +- tests/samples/scc.py | 26 +- tests/samples/srt.py | 12 +- tests/samples/webvtt.py | 32 +- tests/test_dfxp.py | 35 +- tests/test_dfxp_conversion.py | 46 +- tests/test_sami.py | 30 +- tests/test_sami_conversion.py | 6 +- tests/test_scc.py | 62 +- tests/test_srt.py | 10 +- tests/test_webvtt.py | 102 +- tox.ini | 2 +- 28 files changed, 1750 insertions(+), 1769 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 4c3ba16d..842817ef 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,8 +43,8 @@ master_doc = 'index' # General information about the project. -project = u'pycaption' -copyright = u'2012, PBS.org (available under the Apache License, Version 2.0)' +project = 'pycaption' +copyright = '2012, PBS.org (available under the Apache License, Version 2.0)' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -199,8 +199,8 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'pycaption.tex', u'pycaption Documentation', - u'PBS', 'manual'), + ('index', 'pycaption.tex', 'pycaption Documentation', + 'PBS', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -229,8 +229,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pycaption', u'pycaption Documentation', - [u'PBS'], 1) + ('index', 'pycaption', 'pycaption Documentation', + ['PBS'], 1) ] # If true, show URL addresses after external links. @@ -243,8 +243,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'pycaption', u'pycaption Documentation', - u'PBS', 'pycaption', 'One line description of project.', + ('index', 'pycaption', 'pycaption Documentation', + 'PBS', 'pycaption', 'One line description of project.', 'Miscellaneous'), ] diff --git a/pycaption/base.py b/pycaption/base.py index abae4804..b5e30d2a 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -4,14 +4,14 @@ from .exceptions import CaptionReadError, CaptionReadTimingError -DEFAULT_LANGUAGE_CODE = u'en-US' +DEFAULT_LANGUAGE_CODE = 'en-US' def force_byte_string(content): try: - return content.encode(u'UTF-8') + return content.encode('UTF-8') except UnicodeEncodeError: - raise RuntimeError(u'Invalid content encoding') + raise RuntimeError('Invalid content encoding') except UnicodeDecodeError: return content @@ -64,10 +64,11 @@ def __init__(self, relativize=True, video_width=None, video_height=None, converted were made. This is necessary for relativization. :param video_height: The height of the video for which the captions being converted were made. This is necessary for relativization. - :param fit_to_screen: If extent is not set or if origin + extent > 100%, - (re)calculate it based on origin. It is a pycaption fix for caption - files that are technically valid but contains inconsistent settings - that may cause long captions to be cut out of the screen. + :param fit_to_screen: If extent is not set or + if origin + extent > 100%, (re)calculate it based on origin. + It is a pycaption fix for caption files that are technically valid + but contains inconsistent settings that may cause long captions to + be cut out of the screen. """ self.relativize = relativize self.video_width = video_width @@ -130,11 +131,11 @@ def __repr__(self): if t == CaptionNode.TEXT: return repr(self.content) elif t == CaptionNode.BREAK: - return repr(u'BREAK') + return repr('BREAK') elif t == CaptionNode.STYLE: - return repr(u'STYLE: %s %s' % (self.start, self.content)) + return repr('STYLE: %s %s' % (self.start, self.content)) else: - raise RuntimeError(u'Unknown node type: ' + unicode(t)) + raise RuntimeError('Unknown node type: ' + str(t)) @staticmethod def create_text(text, layout_info=None): @@ -175,13 +176,13 @@ def __init__(self, start, end, nodes, style={}, layout_info=None): :type layout_info: Layout """ if not isinstance(start, Number): - raise CaptionReadTimingError(u"Captions must be initialized with a" - u" valid start time") + raise CaptionReadTimingError("Captions must be initialized with a" + " valid start time") if not isinstance(end, Number): - raise CaptionReadTimingError(u"Captions must be initialized with a" - u" valid end time") + raise CaptionReadTimingError("Captions must be initialized with a" + " valid end time") if not nodes: - raise CaptionReadError(u"Node list cannot be empty") + raise CaptionReadError("Node list cannot be empty") self.start = start self.end = end self.nodes = nodes @@ -208,7 +209,7 @@ def format_end(self, msec_separator=None): def __repr__(self): return repr( - u'{start} --> {end}\n{text}'.format( + '{start} --> {end}\n{text}'.format( start=self.format_start(), end=self.format_end(), text=self.get_text() @@ -223,29 +224,29 @@ def get_text_for_node(node): if node.type_ == CaptionNode.TEXT: return node.content if node.type_ == CaptionNode.BREAK: - return u'\n' - return u'' + return '\n' + return '' text_nodes = [get_text_for_node(node) for node in self.nodes] - return u''.join(text_nodes).strip() + return ''.join(text_nodes).strip() def _format_timestamp(self, value, msec_separator=None): datetime_value = timedelta(milliseconds=(int(value / 1000))) str_value = text_type(datetime_value)[:11] if not datetime_value.microseconds: - str_value += u'.000' + str_value += '.000' if msec_separator is not None: - str_value = str_value.replace(u".", msec_separator) + str_value = str_value.replace(".", msec_separator) - return u'0' + str_value + return '0' + str_value class CaptionList(list): """ A list of captions with a layout object attached to it """ def __init__(self, iterable=None, layout_info=None): """ - :param iterator: An iterator used to populate the caption list + :param iterable: An iterator used to populate the caption list :param Layout layout_info: A Layout object with the positioning info """ self.layout_info = layout_info @@ -258,10 +259,9 @@ def __getslice__(self, i, j): def __getitem__(self, y): item = list.__getitem__(self, y) - if isinstance(item, Caption) : + if isinstance(item, Caption): return item - return CaptionList(item - , layout_info=self.layout_info) + return CaptionList(item, layout_info=self.layout_info) def __add__(self, other): add_is_safe = ( @@ -305,7 +305,7 @@ def set_captions(self, lang, captions): self._captions[lang] = captions def get_languages(self): - return self._captions.keys() + return list(self._captions.keys()) def get_captions(self, lang): return self._captions.get(lang, []) @@ -334,7 +334,7 @@ def set_styles(self, styles): def is_empty(self): return all( - [len(captions) == 0 for captions in self._captions.values()] + [len(captions) == 0 for captions in list(self._captions.values())] ) def set_layout_info(self, lang, layout_info): @@ -364,6 +364,7 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0): out_captions.append(caption) self.set_captions(lang, out_captions) + # Functions def merge_concurrent_captions(caption_set): """Merge captions that have the same start and end times""" @@ -391,6 +392,7 @@ def merge_concurrent_captions(caption_set): caption_set.set_captions(lang, merged_captions) return caption_set + def merge(captions): """ Merge list of captions into one caption. The start/end times from the first diff --git a/pycaption/dfxp/__init__.py b/pycaption/dfxp/__init__.py index a30993e2..0a6ea04f 100644 --- a/pycaption/dfxp/__init__.py +++ b/pycaption/dfxp/__init__.py @@ -1,4 +1,2 @@ -from __future__ import absolute_import - from .base import * from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter diff --git a/pycaption/dfxp/base.py b/pycaption/dfxp/base.py index d75ddf1b..a8ab7cc2 100644 --- a/pycaption/dfxp/base.py +++ b/pycaption/dfxp/base.py @@ -1,10 +1,7 @@ import re -from builtins import str from copy import deepcopy - -from bs4 import BeautifulSoup, NavigableString from xml.sax.saxutils import escape -import six +from bs4 import BeautifulSoup, NavigableString from ..base import ( BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode, @@ -21,7 +18,7 @@ 'DFXP_DEFAULT_REGION_ID', 'DFXPReader', 'DFXPWriter', 'DFXP_DEFAULT_REGION' ] -DFXP_BASE_MARKUP = u''' +DFXP_BASE_MARKUP = ''' @@ -33,9 +30,9 @@ ''' DFXP_DEFAULT_STYLE = { - u'color': u'white', - u'font-family': u'monospace', - u'font-size': u'1c', + 'color': 'white', + 'font-family': 'monospace', + 'font-size': '1c', } DFXP_DEFAULT_REGION = Layout( @@ -43,13 +40,12 @@ HorizontalAlignmentEnum.CENTER, VerticalAlignmentEnum.BOTTOM) ) -DFXP_DEFAULT_STYLE_ID = u'default' -DFXP_DEFAULT_REGION_ID = u'bottom' +DFXP_DEFAULT_STYLE_ID = 'default' +DFXP_DEFAULT_REGION_ID = 'bottom' class DFXPReader(BaseReader): - def __init__(self, *args, **kw): super(DFXPReader, self).__init__(*args, **kw) self.read_invalid_positioning = ( @@ -57,14 +53,14 @@ def __init__(self, *args, **kw): self.nodes = [] def detect(self, content): - if u'' in content.lower(): + if '' in content.lower(): return True else: return False def read(self, content): - if type(content) != six.text_type: - raise InvalidInputError(u'The content is not a unicode string.') + if type(content) != str: + raise InvalidInputError('The content is not a unicode string.') dfxp_document = self._get_dfxp_parser_class()( content, read_invalid_positioning=self.read_invalid_positioning) @@ -73,25 +69,25 @@ def read(self, content): style_dict = {} # Each div represents all the captions for a single language. - for div in dfxp_document.find_all(u'div'): - lang = div.attrs.get(u'xml:lang', DEFAULT_LANGUAGE_CODE) + for div in dfxp_document.find_all('div'): + lang = div.attrs.get('xml:lang', DEFAULT_LANGUAGE_CODE) caption_dict[lang] = self._translate_div(div) - for style in dfxp_document.find_all(u'style'): - id_ = style.attrs.get(u'xml:id') or style.attrs.get(u'id') + for style in dfxp_document.find_all('style'): + id_ = style.attrs.get('xml:id') or style.attrs.get('id') if id_: # Don't create document styles for those styles that are # descendants of tags. See link: # http://www.w3.org/TR/ttaf1-dfxp/#styling-vocabulary-style - if u'region' not in [ + if 'region' not in [ parent_.name for parent_ in style.parents]: style_dict[id_] = self._translate_style(style) caption_set = CaptionSet(caption_dict, styles=style_dict) if caption_set.is_empty(): - raise CaptionReadNoCaptions(u"empty caption file") + raise CaptionReadNoCaptions("empty caption file") return caption_set @@ -103,7 +99,7 @@ def _get_dfxp_parser_class(): def _translate_div(self, div): return CaptionList( - [self._translate_p_tag(p_tag) for p_tag in div.find_all(u'p')], + [self._translate_p_tag(p_tag) for p_tag in div.find_all('p')], div.layout_info ) @@ -117,26 +113,26 @@ def _translate_p_tag(self, p_tag): start, end, self.nodes, style=styles, layout_info=p_tag.layout_info) def _find_times(self, p_tag): - start = self._translate_time(p_tag[u'begin']) + start = self._translate_time(p_tag['begin']) try: - end = self._translate_time(p_tag[u'end']) + end = self._translate_time(p_tag['end']) except KeyError: - dur = self._translate_time(p_tag[u'dur']) + dur = self._translate_time(p_tag['dur']) end = start + dur return start, end def _translate_time(self, stamp): if stamp[-1].isdigit(): - timesplit = stamp.split(u':') - if u'.' not in timesplit[2]: - timesplit[2] = timesplit[2] + u'.000' - secsplit = timesplit[2].split(u'.') + timesplit = stamp.split(':') + if '.' not in timesplit[2]: + timesplit[2] += '.000' + secsplit = timesplit[2].split('.') if len(timesplit) > 3: secsplit.append((int(timesplit[3]) / 30) * 100) while len(secsplit[1]) < 3: - secsplit[1] += u'0' + secsplit[1] += '0' microseconds = (int(timesplit[0]) * 3600000000 + int(timesplit[1]) * 60000000 + int(secsplit[0]) * 1000000 + @@ -147,16 +143,16 @@ def _translate_time(self, stamp): m = re.search('^([0-9.]+)([a-z]+)$', stamp) value = float(m.group(1)) metric = m.group(2) - if metric == u"h": + if metric == "h": microseconds = value * 60 * 60 * 1000000 - elif metric == u"m": + elif metric == "m": microseconds = value * 60 * 1000000 - elif metric == u"s": + elif metric == "s": microseconds = value * 1000000 - elif metric == u"ms": + elif metric == "ms": microseconds = value * 1000 else: - raise InvalidInputError(u"Unsupported offset-time metric " + metric) + raise InvalidInputError("Unsupported offset-time metric " + metric) return int(microseconds) @@ -164,7 +160,7 @@ def _translate_tag(self, tag): # convert text if isinstance(tag, NavigableString): # strips indentation whitespace only - pattern = re.compile(u"^(?:[\n\r]+\s*)?(.+)") + pattern = re.compile("^(?:[\n\r]+\s*)?(.+)") result = pattern.search(tag) if result: # Escaping/unescaping xml entities is the responsibility of the @@ -177,16 +173,16 @@ def _translate_tag(self, tag): tag_text, layout_info=tag.layout_info) self.nodes.append(node) # convert line breaks - elif tag.name == u'br': + elif tag.name == 'br': self.nodes.append( CaptionNode.create_break(layout_info=tag.layout_info)) # convert italics - elif tag.name == u'span': + elif tag.name == 'span': # convert span self._translate_span(tag) - elif tag.name == u'p' and not tag.contents: + elif tag.name == 'p' and not tag.contents: node = CaptionNode.create_text( - u'', layout_info=tag.layout_info) + '', layout_info=tag.layout_info) self.nodes.append(node) else: # recursively call function for any children elements @@ -200,7 +196,7 @@ def _translate_span(self, tag): # TODO - this is an obvious very old bug. args will be a dictionary. # but since nobody complained, I'll leave it like that. # Happy investigating! - if args != u'': + if args != '': node = CaptionNode.create_style( True, args, layout_info=tag.layout_info) node.start = True @@ -235,38 +231,38 @@ def _translate_style(self, tag): attrs = {} dfxp_attrs = tag.attrs for arg in dfxp_attrs: - if arg.lower() == u"style": + if arg.lower() == "style": # Support multiple classes per tag - attrs[u'classes'] = dfxp_attrs[arg].strip().split(u' ') + attrs['classes'] = dfxp_attrs[arg].strip().split(' ') # Save old class attribute for compatibility - attrs[u'class'] = dfxp_attrs[arg] - elif arg.lower() == u"tts:fontstyle" and dfxp_attrs[arg] == u"italic": - attrs[u'italics'] = True - elif arg.lower() == u"tts:fontweight" and dfxp_attrs[arg] == u"bold": - attrs[u'bold'] = True - elif arg.lower() == u"tts:textdecoration" and u"underline" in dfxp_attrs[arg].strip().split(u" "): - attrs[u'underline'] = True - elif arg.lower() == u"tts:textalign": - attrs[u'text-align'] = dfxp_attrs[arg] - elif arg.lower() == u"tts:fontfamily": - attrs[u'font-family'] = dfxp_attrs[arg] - elif arg.lower() == u"tts:fontsize": - attrs[u'font-size'] = dfxp_attrs[arg] - elif arg.lower() == u"tts:color": - attrs[u'color'] = dfxp_attrs[arg] + attrs['class'] = dfxp_attrs[arg] + elif arg.lower() == "tts:fontstyle" and dfxp_attrs[arg] == "italic": + attrs['italics'] = True + elif arg.lower() == "tts:fontweight" and dfxp_attrs[arg] == "bold": + attrs['bold'] = True + elif arg.lower() == "tts:textdecoration" and "underline" in dfxp_attrs[arg].strip().split(" "): + attrs['underline'] = True + elif arg.lower() == "tts:textalign": + attrs['text-align'] = dfxp_attrs[arg] + elif arg.lower() == "tts:fontfamily": + attrs['font-family'] = dfxp_attrs[arg] + elif arg.lower() == "tts:fontsize": + attrs['font-size'] = dfxp_attrs[arg] + elif arg.lower() == "tts:color": + attrs['color'] = dfxp_attrs[arg] return attrs class DFXPWriter(BaseWriter): def __init__(self, *args, **kwargs): self.write_inline_positioning = kwargs.pop( - u'write_inline_positioning', False) + 'write_inline_positioning', False) self.p_style = False self.open_span = False self.region_creator = None super(DFXPWriter, self).__init__(*args, **kwargs) - def write(self, caption_set, force=u''): + def write(self, caption_set, force=''): """Converts a CaptionSet into an equivalent corresponding DFXP file :type caption_set: pycaption.base.CaptionSet @@ -274,8 +270,8 @@ def write(self, caption_set, force=u''): :rtype: unicode """ - dfxp = BeautifulSoup(DFXP_BASE_MARKUP, u'lxml-xml') - dfxp.find(u'tt')[u'xml:lang'] = u"en" + dfxp = BeautifulSoup(DFXP_BASE_MARKUP, 'lxml-xml') + dfxp.find('tt')['xml:lang'] = "en" langs = caption_set.get_languages() if force in langs: @@ -304,18 +300,18 @@ def write(self, caption_set, force=u''): self.region_creator = self._get_region_creator_class()(dfxp, caption_set) self.region_creator.create_document_regions() - body = dfxp.find(u'body') + body = dfxp.find('body') for lang in langs: - div = dfxp.new_tag(u'div') - div[u'xml:lang'] = six.text_type(lang) + div = dfxp.new_tag('div') + div['xml:lang'] = str(lang) self._assign_positioning_data(div, lang, caption_set) for caption in caption_set.get_captions(lang): if caption.style: caption_style = caption.style else: - caption_style = {u'class': DFXP_DEFAULT_STYLE_ID} + caption_style = {'class': DFXP_DEFAULT_STYLE_ID} p = self._recreate_p_tag( caption, caption_style, dfxp, caption_set, lang) @@ -349,7 +345,7 @@ def _assign_positioning_data(self, tag, lang, caption_set=None, lang, caption_set, caption, caption_node) if assigned_id: - tag[u'region'] = assigned_id + tag['region'] = assigned_id # Write non-standard positioning information if self.write_inline_positioning: @@ -357,16 +353,16 @@ def _assign_positioning_data(self, tag, lang, caption_set=None, def _recreate_styling_tag(self, style, content, dfxp): # TODO - should be drastically simplified: if attributes : append - dfxp_style = dfxp.new_tag(u'style') - dfxp_style.attrs.update({u'xml:id': style}) + dfxp_style = dfxp.new_tag('style') + dfxp_style.attrs.update({'xml:id': style}) attributes = _recreate_style(content, dfxp) dfxp_style.attrs.update(attributes) - new_tag = dfxp.new_tag(u'style') - new_tag.attrs.update({u'xml:id': style}) + new_tag = dfxp.new_tag('style') + new_tag.attrs.update({'xml:id': style}) if dfxp_style != new_tag: - dfxp.find(u'styling').append(dfxp_style) + dfxp.find('styling').append(dfxp_style) return dfxp @@ -374,25 +370,25 @@ def _recreate_p_tag(self, caption, caption_style, dfxp, caption_set=None, lang=None): start = caption.format_start() end = caption.format_end() - p = dfxp.new_tag(u"p", begin=start, end=end) + p = dfxp.new_tag("p", begin=start, end=end) p.string = self._recreate_text(caption, dfxp, caption_set, lang) - if dfxp.find(u"style", {u"xml:id": u"p"}): - p[u'style'] = u'p' + if dfxp.find("style", {"xml:id": "p"}): + p['style'] = 'p' p.attrs.update(_recreate_style(caption_style, dfxp)) return p def _recreate_text(self, caption, dfxp, caption_set=None, lang=None): - line = u'' + line = '' for node in caption.nodes: if node.type_ == CaptionNode.TEXT: line += self._encode(node.content) elif node.type_ == CaptionNode.BREAK: - line = line.rstrip() + u'
\n ' + line = line.rstrip() + '
\n ' elif node.type_ == CaptionNode.STYLE: line = self._recreate_span( @@ -408,34 +404,34 @@ def _recreate_span(self, line, node, dfxp, caption_set=None, caption=None, # We are left with creating tags manually, which is hard to understand # and harder to maintain if node.start: - styles = u'' + styles = '' content_with_style = _recreate_style(node.content, dfxp) - for style, value in content_with_style.items(): - styles += u' %s="%s"' % (style, value) + for style, value in list(content_with_style.items()): + styles += ' %s="%s"' % (style, value) if node.layout_info: region_id, region_attribs = ( self.region_creator.get_positioning_info( lang, caption_set, caption, node )) - styles += u' region="{region_id}"'.format( + styles += ' region="{region_id}"'.format( region_id=region_id) if self.write_inline_positioning: - styles += u' ' + u' '.join( + styles += ' ' + ' '.join( [ - u'{key}="{val}"'.format(key=k_, val=v_) - for k_, v_ in region_attribs.items() + '{key}="{val}"'.format(key=k_, val=v_) + for k_, v_ in list(region_attribs.items()) ] ) if styles: if self.open_span: - line = line.rstrip() + u' ' - line += u'' % styles + line = line.rstrip() + ' ' + line += '' % styles self.open_span = True elif self.open_span: - line = line.rstrip() + u' ' + line = line.rstrip() + ' ' self.open_span = False return line @@ -468,7 +464,7 @@ class LayoutAwareDFXPParser(BeautifulSoup): # to save memory NO_POSITIONING_INFO = None - def __init__(self, markup=u"", features=u"html.parser", builder=None, + def __init__(self, markup="", features="html.parser", builder=None, parse_only=None, from_encoding=None, read_invalid_positioning=False, **kwargs): """The `features` param determines the parser to be used. The parsers @@ -477,9 +473,9 @@ def __init__(self, markup=u"", features=u"html.parser", builder=None, one because even though the docs say it's slower, it's very forgiving (it allows unescaped `<` characters, for example). It doesn't support the `'` entity, however, since it respects the HTML4 and not HTML5 - syntax. Since this is valid XML 1.0, as a workaround we have to manually - replace the every occurance of this entity in the string before using - the parser. + syntax. Since this is valid XML 1.0, as a workaround we have to + manually replace the every occurrence of this entity in the string + before using the parser. The reason why we haven't used the 'xml' parser is that it destroys characters such as < or & (even the escaped ones). @@ -502,14 +498,14 @@ def __init__(self, markup=u"", features=u"html.parser", builder=None, """ # Work around for lack of ''' support in html.parser - markup = markup.replace(u"'", "'") + markup = markup.replace("'", "'") super(LayoutAwareDFXPParser, self).__init__( markup, features, builder, parse_only, from_encoding, **kwargs) self.read_invalid_positioning = read_invalid_positioning - for div in self.find_all(u'div'): + for div in self.find_all('div'): self._pre_order_visit(div) def _pre_order_visit(self, element, inherit_from=None): @@ -544,7 +540,7 @@ def _get_region_from_ancestors(element): region_id = None parent = element.parent while parent: - region_id = parent.get(u'region') + region_id = parent.get('region') if region_id: break parent = parent.parent @@ -564,7 +560,7 @@ def _get_region_from_descendants(element): region_id = None child_region_ids = { - child.get(u'region') for child in element.findChildren() + child.get('region') for child in element.findChildren() } if len(child_region_ids) > 1: raise LookupError @@ -588,8 +584,8 @@ def _determine_region_id(cls, element): # element could be a NavigableString. Those are dumb. region_id = None - if hasattr(element, u'get'): - region_id = element.get(u'region') + if hasattr(element, 'get'): + region_id = element.get('region') if not region_id: region_id = cls._get_region_from_ancestors(element) @@ -616,7 +612,7 @@ def _extract_positioning_information(self, region_id, element): region_tag = None if region_id is not None: - region_tag = self.find(u'region', {u'xml:id': region_id}) + region_tag = self.find('region', {'xml:id': region_id}) region_scraper = ( self._get_layout_info_scraper_class()(self, region_tag)) @@ -656,13 +652,13 @@ def __init__(self, document, region=None): :param region: the region tag """ self.region = region - self._styling_section = document.findChild(u'styling') + self._styling_section = document.findChild('styling') if region: self.region_styles = self._get_style_sources( self._styling_section, region) else: self.region_styles = [] - self.root_element = document.find(u'tt') + self.root_element = document.find('tt') @classmethod def _get_style_sources(cls, styling_section, element): @@ -685,7 +681,7 @@ def _get_style_sources(cls, styling_section, element): styling """ # If we're analyzing a NavigableString, just quit - if not hasattr(element, u'findAll'): + if not hasattr(element, 'findAll'): return () nested_styles = [] @@ -696,19 +692,19 @@ def _get_style_sources(cls, styling_section, element): # if the parent is a
tag. Technically, this step shouldn't be # skipped, but it would make the reader read in O(n^2) (half an hour # for 1500 timed captions) - if element.name not in (u'div', u'body', u'tt'): + if element.name not in ('div', 'body', 'tt'): for style in element.contents: - if getattr(style, u'name', None) == u'style': + if getattr(style, 'name', None) == 'style': nested_styles.extend( cls._get_style_reference_chain(style, styling_section) ) - referenced_style_id = element.get(u'style') + referenced_style_id = element.get('style') referenced_styles = [] if referenced_style_id and styling_section: referenced_style = styling_section.findChild( - u'style', {u'xml:id': referenced_style_id} + 'style', {'xml:id': referenced_style_id} ) referenced_styles = ( @@ -736,11 +732,11 @@ def _get_style_reference_chain(cls, style, styling_tag): if not styling_tag: return result - reference = style.get(u'style') + reference = style.get('style') if reference: referenced_styles = styling_tag.findChildren( - u'style', {u'xml:id': reference} + 'style', {'xml:id': reference} ) if len(referenced_styles) == 1: @@ -749,8 +745,8 @@ def _get_style_reference_chain(cls, style, styling_tag): ) elif len(referenced_styles) > 1: raise CaptionReadSyntaxError( - u"Invalid caption file. " - u"More than 1 style with 'xml:id': {id}" + "Invalid caption file. " + "More than 1 style with 'xml:id': {id}" .format(id=reference) ) @@ -779,36 +775,36 @@ def scrape_positioning_info(self, element=None, even_invalid=False): usable_elem = element if even_invalid else None origin = self._find_attribute( - usable_elem, u'tts:origin', Point.from_xml_attribute, [u'auto'] + usable_elem, 'tts:origin', Point.from_xml_attribute, ['auto'] ) or DFXP_DEFAULT_REGION.origin extent = self._find_attribute( - usable_elem, u'tts:extent', Stretch.from_xml_attribute, [u'auto']) + usable_elem, 'tts:extent', Stretch.from_xml_attribute, ['auto']) if not extent: extent = self._find_root_extent() or DFXP_DEFAULT_REGION.extent padding = self._find_attribute( - usable_elem, u'tts:padding', Padding.from_xml_attribute + usable_elem, 'tts:padding', Padding.from_xml_attribute ) or DFXP_DEFAULT_REGION.padding # tts:textAlign is a special attribute, which can not be ignored when # specified on the element itself (only

nodes matter) # On elements like it is also read, because this was legacy # behavior. - if getattr(element, u'name', None) in (u'span', u'p'): + if getattr(element, 'name', None) in ('span', 'p'): text_align_source = element else: text_align_source = None text_align = ( - self._find_attribute(text_align_source, u'tts:textAlign') + self._find_attribute(text_align_source, 'tts:textAlign') or _create_external_horizontal_alignment( DFXP_DEFAULT_REGION.alignment.horizontal ) ) display_align = ( - self._find_attribute(usable_elem, u'tts:displayAlign') + self._find_attribute(usable_elem, 'tts:displayAlign') or _create_external_vertical_alignment( DFXP_DEFAULT_REGION.alignment.vertical ) @@ -903,16 +899,16 @@ def _find_root_extent(self): if extent is None: root = self.root_element extent = _get_object_from_attribute( - root, u'tts:extent', Stretch.from_xml_attribute + root, 'tts:extent', Stretch.from_xml_attribute ) if extent is not None: if not extent.is_measured_in(UnitEnum.PIXEL): raise CaptionReadSyntaxError( - u"The base element attribute 'tts:extent' should " - u"only be specified in pixels. Check the docs: " - u"http://www.w3.org/TR/ttaf1-dfxp/" - u"#style-attribute-extent" + "The base element attribute 'tts:extent' should " + "only be specified in pixels. Check the docs: " + "http://www.w3.org/TR/ttaf1-dfxp/" + "#style-attribute-extent" ) return extent @@ -997,16 +993,16 @@ def _create_unique_regions(unique_layouts, dfxp, id_factory): :rtype: dict """ region_map = {} - layout_section = dfxp.find(u'layout') + layout_section = dfxp.find('layout') for region_spec in unique_layouts: if ( region_spec.origin or region_spec.extent or region_spec.padding or region_spec.alignment): - new_region = dfxp.new_tag(u'region') + new_region = dfxp.new_tag('region') new_id = id_factory() - new_region[u'xml:id'] = new_id + new_region['xml:id'] = new_id region_map[region_spec] = new_id region_attribs = _convert_layout_to_attributes(region_spec) @@ -1034,12 +1030,12 @@ def create_document_regions(self): self._region_map.update(default_region_map) - def _get_new_id(self, prefix=u'r'): + def _get_new_id(self, prefix='r'): """Return new, unique ids (use an internal counter). :type prefix: unicode """ - new_id = six.text_type((prefix or u'') + six.text_type(self._id_seed)) + new_id = str((prefix or '') + str(self._id_seed)) self._id_seed += 1 return new_id @@ -1099,37 +1095,37 @@ def get_positioning_info( def cleanup_regions(self): """Remove the unused regions from the output file """ - layout_tag = self._dfxp.find(u'layout') + layout_tag = self._dfxp.find('layout') if not layout_tag: return - regions = layout_tag.findChildren(u'region') + regions = layout_tag.findChildren('region') if not regions: return for region in regions: - if region.attrs.get(u'xml:id') not in self._assigned_region_ids: + if region.attrs.get('xml:id') not in self._assigned_region_ids: region.extract() def _recreate_style(content, dfxp): dfxp_style = {} - if u'class' in content: - if dfxp.find(u"style", {u"xml:id": content[u'class']}): - dfxp_style[u'style'] = content[u'class'] - if u'text-align' in content: - dfxp_style[u'tts:textAlign'] = content[u'text-align'] - if u'italics' in content: - dfxp_style[u'tts:fontStyle'] = u'italic' - if u'font-family' in content: - dfxp_style[u'tts:fontFamily'] = content[u'font-family'] - if u'font-size' in content: - dfxp_style[u'tts:fontSize'] = content[u'font-size'] - if u'color' in content: - dfxp_style[u'tts:color'] = content[u'color'] - if u'display-align' in content: - dfxp_style[u'tts:displayAlign'] = content[u'display-align'] + if 'class' in content: + if dfxp.find("style", {"xml:id": content['class']}): + dfxp_style['style'] = content['class'] + if 'text-align' in content: + dfxp_style['tts:textAlign'] = content['text-align'] + if 'italics' in content: + dfxp_style['tts:fontStyle'] = 'italic' + if 'font-family' in content: + dfxp_style['tts:fontFamily'] = content['font-family'] + if 'font-size' in content: + dfxp_style['tts:fontSize'] = content['font-size'] + if 'color' in content: + dfxp_style['tts:color'] = content['color'] + if 'display-align' in content: + dfxp_style['tts:displayAlign'] = content['display-align'] return dfxp_style @@ -1170,15 +1166,15 @@ def _create_external_horizontal_alignment(horizontal_component): result = None if horizontal_component == HorizontalAlignmentEnum.LEFT: - result = u'left' + result = 'left' if horizontal_component == HorizontalAlignmentEnum.CENTER: - result = u'center' + result = 'center' if horizontal_component == HorizontalAlignmentEnum.RIGHT: - result = u'right' + result = 'right' if horizontal_component == HorizontalAlignmentEnum.START: - result = u'start' + result = 'start' if horizontal_component == HorizontalAlignmentEnum.END: - result = u'end' + result = 'end' return result @@ -1193,11 +1189,11 @@ def _create_external_vertical_alignment(vertical_component): result = None if vertical_component == VerticalAlignmentEnum.TOP: - result = u'before' + result = 'before' if vertical_component == VerticalAlignmentEnum.CENTER: - result = u'center' + result = 'center' if vertical_component == VerticalAlignmentEnum.BOTTOM: - result = u'after' + result = 'after' return result @@ -1220,12 +1216,12 @@ def _create_external_alignment(alignment): horizontal_alignment = _create_external_horizontal_alignment( alignment.horizontal) if horizontal_alignment: - result[u'tts:textAlign'] = horizontal_alignment + result['tts:textAlign'] = horizontal_alignment vertical_alignment = _create_external_vertical_alignment( alignment.vertical) if vertical_alignment: - result[u'tts:displayAlign'] = vertical_alignment + result['tts:displayAlign'] = vertical_alignment return result @@ -1243,7 +1239,7 @@ def _get_object_from_attribute(tag, attr_name, factory, :param ignore_vals: iterable of attribute values to ignore :raise CaptionReadSyntaxError: if the attribute has some crazy value """ - if not hasattr(tag, u'has_attr'): + if not hasattr(tag, 'has_attr'): return attr_value = None @@ -1279,18 +1275,18 @@ def _convert_layout_to_attributes(layout): result = {} if not layout: # TODO - change this to actually use the DFXP_DEFAULT_REGION - result[u'tts:textAlign'] = HorizontalAlignmentEnum.CENTER - result[u'tts:displayAlign'] = VerticalAlignmentEnum.BOTTOM + result['tts:textAlign'] = HorizontalAlignmentEnum.CENTER + result['tts:displayAlign'] = VerticalAlignmentEnum.BOTTOM return result if layout.origin: - result[u'tts:origin'] = layout.origin.to_xml_attribute() + result['tts:origin'] = layout.origin.to_xml_attribute() if layout.extent: - result[u'tts:extent'] = layout.extent.to_xml_attribute() + result['tts:extent'] = layout.extent.to_xml_attribute() if layout.padding: - result[u'tts:padding'] = layout.padding.to_xml_attribute() + result['tts:padding'] = layout.padding.to_xml_attribute() if layout.alignment: result.update(_create_external_alignment(layout.alignment)) diff --git a/pycaption/dfxp/extras.py b/pycaption/dfxp/extras.py index 779f1de6..48855559 100644 --- a/pycaption/dfxp/extras.py +++ b/pycaption/dfxp/extras.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape from bs4 import BeautifulSoup -LEGACY_DFXP_BASE_MARKUP = u''' +LEGACY_DFXP_BASE_MARKUP = ''' @@ -21,17 +21,17 @@ ''' LEGACY_DFXP_DEFAULT_STYLE = { - u'color': u'white', - u'font-family': u'monospace', - u'font-size': u'1c', + 'color': 'white', + 'font-family': 'monospace', + 'font-size': '1c', } -LEGACY_DFXP_DEFAULT_STYLE_ID = u'default' -LEGACY_DFXP_DEFAULT_REGION_ID = u'bottom' +LEGACY_DFXP_DEFAULT_STYLE_ID = 'default' +LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom' LEGACY_DFXP_DEFAULT_REGION = { - u'text-align': u'center', - u'display-align': u'after' + 'text-align': 'center', + 'display-align': 'after' } @@ -43,7 +43,7 @@ def __init__(self, default_positioning=DFXP_DEFAULT_REGION, super(SinglePositioningDFXPWriter, self).__init__(*args, **kwargs) self.default_positioning = default_positioning - def write(self, captions_set, force=u''): + def write(self, captions_set, force=''): """Writes a DFXP file using the positioning provided in the initializer :type captions_set: pycaption.base.CaptionSet @@ -88,18 +88,19 @@ def _create_single_positioning_caption_set(caption_set, positioning): return caption_set + class LegacyDFXPWriter(BaseWriter): """Ported the legacy DFXPWriter from 0.4.5""" def __init__(self, *args, **kw): self.p_style = False self.open_span = False - def write(self, caption_set, force=u''): + def write(self, caption_set, force=''): caption_set = deepcopy(caption_set) caption_set = merge_concurrent_captions(caption_set) - dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, u'lxml-xml') - dfxp.find(u'tt')[u'xml:lang'] = u"en" + dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml') + dfxp.find('tt')['xml:lang'] = "en" for style_id, style in caption_set.get_styles(): if style != {}: @@ -113,7 +114,7 @@ def write(self, caption_set, force=u''): dfxp = self._recreate_region_tag( LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp) - body = dfxp.find(u'body') + body = dfxp.find('body') if force: langs = [self._force_language(force, caption_set.get_languages())] @@ -121,16 +122,16 @@ def write(self, caption_set, force=u''): langs = caption_set.get_languages() for lang in langs: - div = dfxp.new_tag(u'div') - div[u'xml:lang'] = u'%s' % lang + div = dfxp.new_tag('div') + div['xml:lang'] = '%s' % lang for caption in caption_set.get_captions(lang): if caption.style: caption_style = caption.style - caption_style.update({u'region': LEGACY_DFXP_DEFAULT_REGION_ID}) + caption_style.update({'region': LEGACY_DFXP_DEFAULT_REGION_ID}) else: - caption_style = {u'class': LEGACY_DFXP_DEFAULT_STYLE_ID, - u'region': LEGACY_DFXP_DEFAULT_REGION_ID} + caption_style = {'class': LEGACY_DFXP_DEFAULT_STYLE_ID, + 'region': LEGACY_DFXP_DEFAULT_REGION_ID} p = self._recreate_p_tag(caption, caption_style, dfxp) div.append(p) @@ -148,54 +149,54 @@ def _force_language(self, force, langs): return langs[-1] def _recreate_region_tag(self, region_id, styling, dfxp): - dfxp_region = dfxp.new_tag(u'region') - dfxp_region.attrs.update({u'xml:id': region_id}) + dfxp_region = dfxp.new_tag('region') + dfxp_region.attrs.update({'xml:id': region_id}) attributes = self._recreate_style(styling, dfxp) dfxp_region.attrs.update(attributes) - new_tag = dfxp.new_tag(u'region') - new_tag.attrs.update({u'xml:id': region_id}) + new_tag = dfxp.new_tag('region') + new_tag.attrs.update({'xml:id': region_id}) if dfxp_region != new_tag: - dfxp.find(u'layout').append(dfxp_region) + dfxp.find('layout').append(dfxp_region) return dfxp def _recreate_styling_tag(self, style, content, dfxp): - dfxp_style = dfxp.new_tag(u'style') - dfxp_style.attrs.update({u'xml:id': style}) + dfxp_style = dfxp.new_tag('style') + dfxp_style.attrs.update({'xml:id': style}) attributes = self._recreate_style(content, dfxp) dfxp_style.attrs.update(attributes) - new_tag = dfxp.new_tag(u'style') - new_tag.attrs.update({u'xml:id': style}) + new_tag = dfxp.new_tag('style') + new_tag.attrs.update({'xml:id': style}) if dfxp_style != new_tag: - dfxp.find(u'styling').append(dfxp_style) + dfxp.find('styling').append(dfxp_style) return dfxp def _recreate_p_tag(self, caption, caption_style, dfxp): start = caption.format_start() end = caption.format_end() - p = dfxp.new_tag(u"p", begin=start, end=end) + p = dfxp.new_tag("p", begin=start, end=end) p.string = self._recreate_text(caption, dfxp) - if dfxp.find(u"style", {u"xml:id": u"p"}): - p[u'style'] = u'p' + if dfxp.find("style", {"xml:id": "p"}): + p['style'] = 'p' p.attrs.update(self._recreate_style(caption_style, dfxp)) return p def _recreate_text(self, caption, dfxp): - line = u'' + line = '' for node in caption.nodes: if node.type_ == CaptionNode.TEXT: - line += escape(node.content) + u' ' + line += escape(node.content) + ' ' elif node.type_ == CaptionNode.BREAK: - line = line.rstrip() + u'
\n ' + line = line.rstrip() + '
\n ' elif node.type_ == CaptionNode.STYLE: line = self._recreate_span(line, node, dfxp) @@ -204,20 +205,20 @@ def _recreate_text(self, caption, dfxp): def _recreate_span(self, line, node, dfxp): if node.start: - styles = u'' + styles = '' content_with_style = self._recreate_style(node.content, dfxp) - for style, value in content_with_style.items(): - styles += u' %s="%s"' % (style, value) + for style, value in list(content_with_style.items()): + styles += ' %s="%s"' % (style, value) if styles: if self.open_span: - line = line.rstrip() + u'
' - line += u'' % styles + line = line.rstrip() + ' ' + line += '' % styles self.open_span = True elif self.open_span: - line = line.rstrip() + u' ' + line = line.rstrip() + ' ' self.open_span = False return line @@ -225,23 +226,23 @@ def _recreate_span(self, line, node, dfxp): def _recreate_style(self, content, dfxp): dfxp_style = {} - if u'region' in content: - if dfxp.find(u'region', {u'xml:id': content[u'region']}): - dfxp_style[u'region'] = content[u'region'] - if u'class' in content: - if dfxp.find(u"style", {u"xml:id": content[u'class']}): - dfxp_style[u'style'] = content[u'class'] - if u'text-align' in content: - dfxp_style[u'tts:textAlign'] = content[u'text-align'] - if u'italics' in content: - dfxp_style[u'tts:fontStyle'] = u'italic' - if u'font-family' in content: - dfxp_style[u'tts:fontFamily'] = content[u'font-family'] - if u'font-size' in content: - dfxp_style[u'tts:fontSize'] = content[u'font-size'] - if u'color' in content: - dfxp_style[u'tts:color'] = content[u'color'] - if u'display-align' in content: - dfxp_style[u'tts:displayAlign'] = content[u'display-align'] + if 'region' in content: + if dfxp.find('region', {'xml:id': content['region']}): + dfxp_style['region'] = content['region'] + if 'class' in content: + if dfxp.find("style", {"xml:id": content['class']}): + dfxp_style['style'] = content['class'] + if 'text-align' in content: + dfxp_style['tts:textAlign'] = content['text-align'] + if 'italics' in content: + dfxp_style['tts:fontStyle'] = 'italic' + if 'font-family' in content: + dfxp_style['tts:fontFamily'] = content['font-family'] + if 'font-size' in content: + dfxp_style['tts:fontSize'] = content['font-size'] + if 'color' in content: + dfxp_style['tts:color'] = content['color'] + if 'display-align' in content: + dfxp_style['tts:displayAlign'] = content['display-align'] return dfxp_style diff --git a/pycaption/geometry.py b/pycaption/geometry.py index 8fab090f..a39e9b1b 100644 --- a/pycaption/geometry.py +++ b/pycaption/geometry.py @@ -9,8 +9,8 @@ """ import six -from .exceptions import RelativizationError from enum import Enum +from .exceptions import RelativizationError class UnitEnum(Enum): @@ -22,12 +22,11 @@ class UnitEnum(Enum): if unit == UnitEnum.CELL : ... """ - PIXEL = u'px' - EM = u'em' - PERCENT = u'%' - CELL = u'c' - PT = u'pt' - + PIXEL = 'px' + EM = 'em' + PERCENT = '%' + CELL = 'c' + PT = 'pt' class VerticalAlignmentEnum(Enum): @@ -38,19 +37,19 @@ class VerticalAlignmentEnum(Enum): if alignment == VerticalAlignmentEnum.BOTTOM: ... """ - TOP = u'top' - CENTER = u'center' - BOTTOM = u'bottom' + TOP = 'top' + CENTER = 'center' + BOTTOM = 'bottom' class HorizontalAlignmentEnum(Enum): """Enumeration object specifying the horizontal alignment preferences """ - LEFT = u'left' - CENTER = u'center' - RIGHT = u'right' - START = u'start' - END = u'end' + LEFT = 'left' + CENTER = 'center' + RIGHT = 'right' + START = 'start' + END = 'end' class Alignment(object): @@ -80,7 +79,7 @@ def __eq__(self, other): ) def __repr__(self): - return u"".format( + return "".format( horizontal=self.horizontal, vertical=self.vertical ) @@ -95,22 +94,22 @@ def from_horizontal_and_vertical_align(cls, text_align=None, horizontal_obj = None vertical_obj = None - if text_align == u'left': + if text_align == 'left': horizontal_obj = HorizontalAlignmentEnum.LEFT - if text_align == u'start': + if text_align == 'start': horizontal_obj = HorizontalAlignmentEnum.START - if text_align == u'center': + if text_align == 'center': horizontal_obj = HorizontalAlignmentEnum.CENTER - if text_align == u'right': + if text_align == 'right': horizontal_obj = HorizontalAlignmentEnum.RIGHT - if text_align == u'end': + if text_align == 'end': horizontal_obj = HorizontalAlignmentEnum.END - if display_align == u'before': + if display_align == 'before': vertical_obj = VerticalAlignmentEnum.TOP - if display_align == u'center': + if display_align == 'center': vertical_obj = VerticalAlignmentEnum.CENTER - if display_align == u'after': + if display_align == 'after': vertical_obj = VerticalAlignmentEnum.BOTTOM if not any([horizontal_obj, vertical_obj]): @@ -130,7 +129,7 @@ def from_xml_attribute(cls, attribute): :type attribute: unicode """ - horizontal, vertical = six.text_type(attribute).split(u' ') + horizontal, vertical = six.text_type(attribute).split(' ') horizontal = Size.from_string(horizontal) vertical = Size.from_string(vertical) @@ -150,8 +149,8 @@ def __init__(self, horizontal, vertical): """ for parameter in [horizontal, vertical]: if not isinstance(parameter, Size): - raise ValueError(u"Stretch must be initialized with two valid " - u"Size objects.") + raise ValueError("Stretch must be initialized with two valid " + "Size objects.") self.horizontal = horizontal self.vertical = vertical @@ -167,7 +166,7 @@ def is_measured_in(self, measure_unit): ) def __repr__(self): - return u''.format( + return ''.format( horizontal=self.horizontal, vertical=self.vertical ) @@ -193,13 +192,13 @@ def __hash__(self): 67 ) - def __nonzero__(self): + def __bool__(self): return True if self.horizontal or self.vertical else False def to_xml_attribute(self, **kwargs): """Returns a unicode representation of this object as an xml attribute """ - return u'{horizontal} {vertical}'.format( + return '{horizontal} {vertical}'.format( horizontal=self.horizontal.to_xml_attribute(), vertical=self.vertical.to_xml_attribute() ) @@ -314,8 +313,8 @@ def __init__(self, x, y): """ for parameter in [x, y]: if not isinstance(parameter, Size): - raise ValueError(u"Point must be initialized with two valid " - u"Size objects.") + raise ValueError("Point must be initialized with two valid " + "Size objects.") self.x = x self.y = y @@ -368,7 +367,7 @@ def align_from_origin(cls, p1, p2): Point(max(p1.x, p2.x), max(p1.y, p2.y))) def __repr__(self): - return u''.format( + return ''.format( x=self.x, y=self.y ) @@ -395,13 +394,13 @@ def __hash__(self): 57 ) - def __nonzero__(self): + def __bool__(self): return True if self.x or self.y else False def to_xml_attribute(self, **kwargs): """Returns a unicode representation of this object as an xml attribute """ - return u'{x} {y}'.format( + return '{x} {y}'.format( x=self.x.to_xml_attribute(), y=self.y.to_xml_attribute()) @@ -417,9 +416,9 @@ def __init__(self, value, unit): :param unit: A UnitEnum member """ if value is None: - raise ValueError(u"Size must be initialized with a value.") + raise ValueError("Size must be initialized with a value.") if not isinstance(unit,UnitEnum): - raise ValueError(u"Size must be initialized with a valid unit.") + raise ValueError("Size must be initialized with a valid unit.") self.value = float(value) self.unit = unit @@ -428,16 +427,17 @@ def __sub__(self, other): if self.unit == other.unit: return Size(self.value - other.value, self.unit) else: - raise ValueError(u"The sizes should have the same measure units.") + raise ValueError("The sizes should have the same measure units.") def __abs__(self): return Size(abs(self.value), self.unit) def __cmp__(self, other): if self.unit == other.unit: - return cmp(self.value, other.value) + # python3 does not have cmp + return (self.value > other.value) - (self.value < other.value) else: - raise ValueError(u"The sizes should have the same measure units.") + raise ValueError("The sizes should have the same measure units.") def __lt__(self, other): return self.value < other.value @@ -447,7 +447,7 @@ def __add__(self, other): if self.unit == other.unit: return Size(self.value + other.value, self.unit) else: - raise ValueError(u"The sizes should have the same measure units.") + raise ValueError("The sizes should have the same measure units.") def is_relative(self): """ @@ -469,10 +469,10 @@ def as_percentage_of(self, video_width=None, video_height=None): # The input must be valid so that any conversion can be done if not (video_width or video_height): raise RelativizationError( - u"Either video width or height must be given as a reference") + "Either video width or height must be given as a reference") elif video_width and video_height: raise RelativizationError( - u"Only video width or height can be given as reference") + "Only video width or height can be given as reference") if unit == UnitEnum.EM: # TODO: Implement proper conversion of em in function of font-size @@ -533,31 +533,31 @@ def from_string(cls, string): if value is None: raise ValueError( - u"""Couldn't recognize the value "{value}" as a number""" + """Couldn't recognize the value "{value}" as a number""" .format(value=raw_number) ) instance = cls(value, unit) return instance else: raise ValueError( - u"The specified value is not valid because its unit " - u"is not recognized: {value}. " - u"The only supported units are: {supported}" - .format(value=raw_number, supported=u', '.join(UnitEnum._member_map_)) + "The specified value is not valid because its unit " + "is not recognized: {value}. " + "The only supported units are: {supported}" + .format(value=raw_number, supported=', '.join(UnitEnum._member_map_)) ) def __repr__(self): - return u''.format( + return ''.format( value=self.value, unit=self.unit.value ) def __str__(self): value = round(self.value, 2) if value.is_integer(): - s = u"{}".format(int(value)) + s = "{}".format(int(value)) else: - s = u"{:.2f}".format(value).rstrip('0').rstrip('.') - return u"{}{}".format(s, self.unit.value) + s = "{:.2f}".format(value).rstrip('0').rstrip('.') + return "{}{}".format(s, self.unit.value) def to_xml_attribute(self, **kwargs): """Returns a unicode representation of this object, as an xml attribute @@ -583,7 +583,7 @@ def __hash__(self): 47 ) - def __nonzero__(self): + def __bool__(self): return self.unit in UnitEnum and self.value is not None @@ -628,7 +628,7 @@ def from_xml_attribute(cls, attribute): :param attribute: a string like object, representing a dfxp attr. value :return: a Padding object """ - values_list = six.text_type(attribute).split(u' ') + values_list = six.text_type(attribute).split(' ') sizes = [] for value in values_list: @@ -643,16 +643,16 @@ def from_xml_attribute(cls, attribute): elif len(sizes) == 4: return cls(sizes[0], sizes[2], sizes[3], sizes[1]) else: - raise ValueError(u'The provided value "{value}" could not be ' - u"parsed into the a padding. Check out " - u"http://www.w3.org/TR/ttaf1-dfxp/" - u"#style-attribute-padding for the definition " - u"and examples".format(value=attribute)) + raise ValueError('The provided value "{value}" could not be ' + "parsed into the a padding. Check out " + "http://www.w3.org/TR/ttaf1-dfxp/" + "#style-attribute-padding for the definition " + "and examples".format(value=attribute)) def __repr__(self): return ( - u"".format( + "".format( before=self.before, after=self.after, start=self.start, end=self.end ) @@ -688,7 +688,7 @@ def __hash__(self): ) def to_xml_attribute( - self, attribute_order=(u'before', u'end', u'after', u'start'), + self, attribute_order=('before', 'end', 'after', 'start'), **kwargs): """Returns a unicode representation of this object as an xml attribute @@ -709,9 +709,9 @@ def to_xml_attribute( # A Padding object with attributes set to None is considered # invalid. All four possible paddings must be set. If one of them # is not, this error is raised. - raise ValueError(u"The attribute order specified is invalid.") + raise ValueError("The attribute order specified is invalid.") - return u' '.join(string_list) + return ' '.join(string_list) def as_percentage_of(self, video_width, video_height): return Padding( @@ -733,6 +733,7 @@ def is_relative(self): is_relative &= self.end.is_relative() return is_relative + class Layout(object): """Should encapsulate all the information needed to determine (as correctly as possible) the layout (positioning) of elements on the screen. @@ -780,7 +781,7 @@ def __init__(self, origin=None, extent=None, padding=None, alignment=None, if not attr: setattr(self, attr_name, getattr(inherit_from, attr_name)) - def __nonzero__(self): + def __bool__(self): return any([ self.origin, self.extent, self.padding, self.alignment, self.webvtt_positioning @@ -788,8 +789,8 @@ def __nonzero__(self): def __repr__(self): return ( - u"".format( + "".format( origin=self.origin, extent=self.extent, padding=self.padding, alignment=self.alignment ) diff --git a/pycaption/sami.py b/pycaption/sami.py index 3840a3ef..4911e771 100644 --- a/pycaption/sami.py +++ b/pycaption/sami.py @@ -36,42 +36,33 @@ """ import re - -from collections import deque import six - -try: - from htmlentitydefs import name2codepoint -except: - from html.entities import name2codepoint - -try: - from HTMLParser import HTMLParser, HTMLParseError -except: - from html.parser import HTMLParser - from logging import FATAL +from collections import deque +from copy import deepcopy +from future.backports.html.parser import HTMLParseError + +from html.parser import HTMLParser +from html.entities import name2codepoint from xml.sax.saxutils import escape -from copy import deepcopy, copy -from cssutils import parseString, log, css as cssutils_css + from bs4 import BeautifulSoup, NavigableString +from cssutils import parseString, log, css as cssutils_css from .base import ( BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode, DEFAULT_LANGUAGE_CODE) from .exceptions import ( CaptionReadNoCaptions, CaptionReadSyntaxError, InvalidInputError) -from .geometry import ( - Layout, Alignment, Padding, Size -) +from .geometry import Layout, Alignment, Padding, Size # change cssutils default logging log.setLevel(FATAL) -SAMI_BASE_MARKUP = u''' +SAMI_BASE_MARKUP = ''' @@ -414,7 +414,7 @@ """ -SAMPLE_SAMI_WITH_LANG = u""" +SAMPLE_SAMI_WITH_LANG = """ diff --git a/tests/samples/scc.py b/tests/samples/scc.py index 666d286d..011bf8e8 100644 --- a/tests/samples/scc.py +++ b/tests/samples/scc.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # -SAMPLE_SCC_CREATED_DFXP_WITH_WRONGLY_CLOSING_SPANS = u"""\ +SAMPLE_SCC_CREATED_DFXP_WITH_WRONGLY_CLOSING_SPANS = """\ Scenarist_SCC V1.0 00:01:28;09 9420 942f 94ae 9420 9452 97a2 e3e3 e3e3 e3e3 9470 9723 e3a1 e3a1 @@ -21,13 +21,13 @@ 00:01:59;14 9420 942f 94ae 9420 94f4 6464 6464 """ -SCC_THAT_GENERATES_WEBVTT_WITH_PROPER_NEWLINES = u"""\ +SCC_THAT_GENERATES_WEBVTT_WITH_PROPER_NEWLINES = """\ Scenarist_SCC V1.0 00:21:29;23 9420 9452 6161 94f4 97a2 6262 942c 942f """ -SAMPLE_SCC_PRODUCES_CAPTIONS_WITH_START_AND_END_TIME_THE_SAME = u"""\ +SAMPLE_SCC_PRODUCES_CAPTIONS_WITH_START_AND_END_TIME_THE_SAME = """\ Scenarist_SCC V1.0 00:01:31;18 9420 9454 6162 9758 97a1 91ae 6261 9170 97a1 e362 @@ -37,7 +37,7 @@ 00:01:40;25 942c """ -SAMPLE_SCC_POP_ON = u"""Scenarist_SCC V1.0 +SAMPLE_SCC_POP_ON = """Scenarist_SCC V1.0 00:00:09:05 94ae 94ae 9420 9420 9470 9470 a820 e3ec efe3 6b20 f4e9 e36b e96e 6720 2980 942c 942c 942f 942f @@ -66,7 +66,7 @@ # 2 Roll-Up captions - same comment # 2 Paint-on captions - same comment # - the TAB OVER commands are not interpreted (97A1, 97A2, 9723) -SAMPLE_SCC_MULTIPLE_POSITIONING = u"""Scenarist_SCC V1.0 +SAMPLE_SCC_MULTIPLE_POSITIONING = """Scenarist_SCC V1.0 00:00:00:16 94ae 94ae 9420 9420 1370 1370 6162 6162 91d6 91d6 e364 e364 927c 927c e5e6 e5e6 942c 942c 942f 942f @@ -85,22 +85,22 @@ """ # UNUSED SAMPLE -SAMPLE_SCC_WITH_ITALICS_BKUP = u"""\ +SAMPLE_SCC_WITH_ITALICS_BKUP = """\ Scenarist_SCC V1.0 00:00:00:01 9420 10d0 97a2 91ae 6162 6162 6162 6162 942c 8080 8080 942f """ -SAMPLE_SCC_WITH_ITALICS = u"""\ +SAMPLE_SCC_WITH_ITALICS = """\ 00:00:00:01 9420 10d0 97a2 91ae 6162 6162 6162 6162 942c 8080 8080 942f """ -SAMPLE_SCC_EMPTY = u"""Scenarist_SCC V1.0 +SAMPLE_SCC_EMPTY = """Scenarist_SCC V1.0 """ -SAMPLE_SCC_ROLL_UP_RU2 = u"""\ +SAMPLE_SCC_ROLL_UP_RU2 = """\ Scenarist_SCC V1.0 00:00:00;22 9425 9425 94ad 94ad 9470 9470 3e3e 3e20 c849 ae @@ -136,7 +136,7 @@ """ -SAMPLE_SCC_PRODUCES_BAD_LAST_END_TIME = u"""\ +SAMPLE_SCC_PRODUCES_BAD_LAST_END_TIME = """\ Scenarist_SCC V1.0 00:23:28;01 9420 94ae 9154 5245 91f4 c1c2 942c @@ -148,7 +148,7 @@ 00:54:29;21 942f """ -SAMPLE_NO_POSITIONING_AT_ALL_SCC = u"""\ +SAMPLE_NO_POSITIONING_AT_ALL_SCC = """\ Scenarist_SCC V1.0 00:23:28;01 9420 94ae 5245 c1c2 942c @@ -161,7 +161,7 @@ """ # UNUSED SAMPLE -SAMPLE_SCC_NOT_EXPLICITLY_SWITCHING_ITALICS_OFF = u"""\ +SAMPLE_SCC_NOT_EXPLICITLY_SWITCHING_ITALICS_OFF = """\ Scenarist_SCC V1.0 00:01:28;09 9420 942f 94ae 9420 9452 97a2 b031 6161 9470 9723 b031 6262 @@ -215,7 +215,7 @@ 00:53:03;15 9420 94f4 97a1 94f4 97a1 91ae 31b6 6464 """ -SAMPLE_SCC_NO_EXPLICIT_END_TO_LAST_CAPTION = u"""\ +SAMPLE_SCC_NO_EXPLICIT_END_TO_LAST_CAPTION = """\ Scenarist_SCC V1.0 00:00:00;00 73e9 e329 942f diff --git a/tests/samples/srt.py b/tests/samples/srt.py index e96dfcff..f98b8e96 100644 --- a/tests/samples/srt.py +++ b/tests/samples/srt.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -SAMPLE_SRT = u"""1 +SAMPLE_SRT = """1 00:00:09,209 --> 00:00:12,312 ( clock ticking ) @@ -35,7 +35,7 @@ """ -SAMPLE_SRT_ASCII = u"""1 +SAMPLE_SRT_ASCII = """1 00:00:09,209 --> 00:00:12,312 ( clock ticking ) @@ -74,7 +74,7 @@ some more text """ -SAMPLE_SRT_NUMERIC = u"""35 +SAMPLE_SRT_NUMERIC = """35 00:00:32,290 --> 00:00:32,890 TO FIND HIM. IF @@ -104,10 +104,10 @@ """ -SAMPLE_SRT_EMPTY = u""" +SAMPLE_SRT_EMPTY = """ """ -SAMPLE_SRT_BLANK_LINES = u"""35 +SAMPLE_SRT_BLANK_LINES = """35 00:00:32,290 --> 00:00:32,890 @@ -117,7 +117,7 @@ """ -SAMPLE_SRT_TRAILING_BLANKS = u"""35 +SAMPLE_SRT_TRAILING_BLANKS = """35 00:00:32,290 --> 00:00:32,890 HELP I SAY diff --git a/tests/samples/webvtt.py b/tests/samples/webvtt.py index 634a8997..228897a5 100644 --- a/tests/samples/webvtt.py +++ b/tests/samples/webvtt.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -SAMPLE_WEBVTT = u"""WEBVTT +SAMPLE_WEBVTT = """WEBVTT 00:09.209 --> 00:12.312 ( clock ticking ) @@ -30,7 +30,7 @@ """ -SAMPLE_WEBVTT_FROM_DFXP = u"""WEBVTT +SAMPLE_WEBVTT_FROM_DFXP = """WEBVTT 00:09.209 --> 00:12.312 ( clock ticking ) @@ -63,14 +63,14 @@ SAMPLE_WEBVTT_FROM_SAMI = SAMPLE_WEBVTT_FROM_DFXP -SAMPLE_WEBVTT_FROM_SAMI_WITH_STYLE = u"""WEBVTT +SAMPLE_WEBVTT_FROM_SAMI_WITH_STYLE = """WEBVTT 00:09.209 --> 00:12.312 I do not want to go home. I don't like it there. """ -SAMPLE_WEBVTT_FROM_SAMI_WITH_ID_STYLE = u"""WEBVTT +SAMPLE_WEBVTT_FROM_SAMI_WITH_ID_STYLE = """WEBVTT 00:09.209 --> 00:12.312 This is in italics. @@ -85,13 +85,13 @@ This is everything together. """ -SAMPLE_WEBVTT_FROM_DFXP_WITH_STYLE = u"""WEBVTT +SAMPLE_WEBVTT_FROM_DFXP_WITH_STYLE = """WEBVTT 00:09.209 --> 00:12.312 This is italic, bold, underline, everything together in one tag, and nested. """ -SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING = u"""WEBVTT +SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING = """WEBVTT 00:01.000 --> 00:03.000 position:25%,start line:25% size:50% You might not remember us. We are a typical transparent region with centered text that has an outline. @@ -107,7 +107,7 @@ the last cue """ -SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING_AND_STYLE = u"""WEBVTT +SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING_AND_STYLE = """WEBVTT 00:01.000 --> 00:03.000 position:25%,start line:25% size:50% You might not remember us. We are a typical transparent region with centered text that has an outline. @@ -123,7 +123,7 @@ the last cue """ -SAMPLE_WEBVTT_FROM_SRT = u"""WEBVTT +SAMPLE_WEBVTT_FROM_SRT = """WEBVTT 00:09.209 --> 00:12.312 ( clock ticking ) @@ -158,7 +158,7 @@ # in order to conform to the specification. SAMPLE_WEBVTT_FROM_WEBVTT = SAMPLE_WEBVTT_FROM_SRT -SAMPLE_WEBVTT_2 = u"""WEBVTT +SAMPLE_WEBVTT_2 = """WEBVTT 1 00:00:00.000 --> 00:00:43.000 @@ -189,10 +189,10 @@ HEY. WATCH THIS. """ -SAMPLE_WEBVTT_EMPTY = u"""WEBVTT +SAMPLE_WEBVTT_EMPTY = """WEBVTT """ -SAMPLE_WEBVTT_DOUBLE_BR = u"""WEBVTT +SAMPLE_WEBVTT_DOUBLE_BR = """WEBVTT 00:14.848 --> 00:18.848 MAN: @@ -201,7 +201,7 @@ of "E equals m c-squared", """ -SAMPLE_WEBVTT_OUTPUT_LONG_CUE = u"""WEBVTT +SAMPLE_WEBVTT_OUTPUT_LONG_CUE = """WEBVTT 00:01.000 --> 00:02.000 NARRATOR: @@ -213,7 +213,7 @@ most complex machine in history. """ -WEBVTT_FROM_DFXP_WITH_CONFLICTING_ALIGN = u"""WEBVTT +WEBVTT_FROM_DFXP_WITH_CONFLICTING_ALIGN = """WEBVTT 00:04.537 --> 00:07.841 IT'S WORD GIRL♫ @@ -223,7 +223,7 @@ IT'S WORD GIRL♫ """ -SAMPLE_WEBVTT_WITH_CUE_SETTINGS = u"""\ +SAMPLE_WEBVTT_WITH_CUE_SETTINGS = """\ WEBVTT 00:01.000 --> 00:06.000 align:middle position:37%,start line:74% @@ -233,7 +233,7 @@ They built the largest, """ -SAMPLE_WEBVTT_FROM_SCC_PROPERLY_WRITES_NEWLINES_OUTPUT = u"""\ +SAMPLE_WEBVTT_FROM_SCC_PROPERLY_WRITES_NEWLINES_OUTPUT = """\ WEBVTT 21:30.033 --> 21:34.033 align:left position:12.5%,start line:86.67% size:87.5% @@ -241,7 +241,7 @@ bb """ -SAMPLE_WEBVTT_LAST_CUE_ZERO_START = u"""WEBVTT +SAMPLE_WEBVTT_LAST_CUE_ZERO_START = """WEBVTT 00:00.000 --> 00:12.312 ( clock ticking )""" diff --git a/tests/test_dfxp.py b/tests/test_dfxp.py index 2801b880..62967801 100644 --- a/tests/test_dfxp.py +++ b/tests/test_dfxp.py @@ -17,27 +17,26 @@ def test_detection(self): def test_caption_length(self): captions = DFXPReader().read(SAMPLE_DFXP) - self.assertEqual(7, len(captions.get_captions(u"en-US"))) + self.assertEqual(7, len(captions.get_captions("en-US"))) def test_proper_timestamps(self): captions = DFXPReader().read(SAMPLE_DFXP) - paragraph = captions.get_captions(u"en-US")[2] + paragraph = captions.get_captions("en-US")[2] self.assertEqual(17000000, paragraph.start) self.assertEqual(18752000, paragraph.end) def test_offset_time(self): reader = DFXPReader() - self.assertEquals(1, reader._translate_time(u"0.001ms")) - self.assertEquals(2000, reader._translate_time(u"2ms")) - self.assertEquals(1000000, reader._translate_time(u"1s")) - self.assertEquals(1234567, reader._translate_time(u"1.234567s")) - self.assertEquals(180000000, reader._translate_time(u"3m")) - self.assertEquals(14400000000, reader._translate_time(u"4h")) + self.assertEqual(1, reader._translate_time("0.001ms")) + self.assertEqual(2000, reader._translate_time("2ms")) + self.assertEqual(1000000, reader._translate_time("1s")) + self.assertEqual(1234567, reader._translate_time("1.234567s")) + self.assertEqual(180000000, reader._translate_time("3m")) + self.assertEqual(14400000000, reader._translate_time("4h")) # Tick values are not supported self.assertRaises( - InvalidInputError, reader._translate_time, u"2.3t" - ) + InvalidInputError, reader._translate_time, "2.3t") def test_empty_file(self): self.assertRaises( @@ -46,12 +45,12 @@ def test_empty_file(self): def test_invalid_markup_is_properly_handled(self): captions = DFXPReader().read(SAMPLE_DFXP_SYNTAX_ERROR) - self.assertEquals(2, len(captions.get_captions(u"en-US"))) + self.assertEqual(2, len(captions.get_captions("en-US"))) def test_caption_error_for_invalid_positioning_values(self): invalid_value_dfxp = ( SAMPLE_DFXP_INVALID_POSITIONING_VALUE_TEMPLATE - .format(origin=u"px 5px") + .format(origin="px 5px") ) self.assertRaises( CaptionReadSyntaxError, DFXPReader().read, @@ -61,7 +60,7 @@ def test_caption_error_for_invalid_positioning_values(self): def test_caption_error_for_invalid_or_unsupported_positioning_units(self): invalid_dfxp = ( SAMPLE_DFXP_INVALID_POSITIONING_VALUE_TEMPLATE - .format(origin=u"6foo 7bar") + .format(origin="6foo 7bar") ) self.assertRaises( CaptionReadSyntaxError, DFXPReader().read, @@ -82,9 +81,9 @@ def test_individual_texts_of_captions_with_matching_timespec_are_kept(self): # SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING ) - expected_texts = [u'Some text here', - u'Some text there', - u'Caption texts are everywhere!'] + expected_texts = ['Some text here', + 'Some text there', + 'Caption texts are everywhere!'] actual_texts = [c_.nodes[0].content for c_ in captionset.get_captions("en-US")] @@ -119,7 +118,7 @@ def test_empty_paragraph(self): self.fail("Failing on empty paragraph") -SAMPLE_DFXP_INVALID_POSITIONING_VALUE_TEMPLATE = u"""\ +SAMPLE_DFXP_INVALID_POSITIONING_VALUE_TEMPLATE = """\ @@ -138,7 +137,7 @@ def test_empty_paragraph(self): # TODO - notice that there's no "bottom" region specified in the # region, but it's referenced by the

. Decide if this is ok enough -SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING = u"""\ +SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING = """\ diff --git a/tests/test_dfxp_conversion.py b/tests/test_dfxp_conversion.py index ce80600e..f5935998 100644 --- a/tests/test_dfxp_conversion.py +++ b/tests/test_dfxp_conversion.py @@ -60,43 +60,43 @@ def test_default_styling_tag(self): result = DFXPWriter().write(caption_set) default_style = _recreate_style(DFXP_DEFAULT_STYLE, None) - default_style[u'xml:id'] = DFXP_DEFAULT_STYLE_ID + default_style['xml:id'] = DFXP_DEFAULT_STYLE_ID - soup = BeautifulSoup(result, u'lxml-xml') - style = soup.find(u'style', {u'xml:id': DFXP_DEFAULT_STYLE_ID}) + soup = BeautifulSoup(result, 'lxml-xml') + style = soup.find('style', {'xml:id': DFXP_DEFAULT_STYLE_ID}) self.assertTrue(style) - self.assertEquals(style.attrs, default_style) + self.assertEqual(style.attrs, default_style) def test_default_styling_p_tags(self): caption_set = DFXPReader().read(SAMPLE_DFXP) result = DFXPWriter().write(caption_set) - soup = BeautifulSoup(result, u'lxml') - for p in soup.find_all(u'p'): - self.assertEquals(p.attrs.get(u'style'), 'p') + soup = BeautifulSoup(result, 'lxml') + for p in soup.find_all('p'): + self.assertEqual(p.attrs.get('style'), 'p') def test_default_region_tag(self): caption_set = DFXPReader().read(SAMPLE_DFXP) result = DFXPWriter().write(caption_set) - soup = BeautifulSoup(result, u'lxml-xml') - region = soup.find(u'region', {u'xml:id': DFXP_DEFAULT_REGION_ID}) + soup = BeautifulSoup(result, 'lxml-xml') + region = soup.find('region', {'xml:id': DFXP_DEFAULT_REGION_ID}) default_region = _convert_layout_to_attributes(DFXP_DEFAULT_REGION) - default_region[u'xml:id'] = DFXP_DEFAULT_REGION_ID + default_region['xml:id'] = DFXP_DEFAULT_REGION_ID self.assertTrue(region) - self.assertEqual(region.attrs[u'xml:id'], DFXP_DEFAULT_REGION_ID) + self.assertEqual(region.attrs['xml:id'], DFXP_DEFAULT_REGION_ID) self.assertEqual(region.attrs, default_region) def test_default_region_p_tags(self): caption_set = DFXPReader().read(SAMPLE_DFXP) result = DFXPWriter().write(caption_set) - soup = BeautifulSoup(result, u'lxml') - for p in soup.find_all(u'p'): - self.assertEqual(p.attrs.get(u'region'), DFXP_DEFAULT_REGION_ID) + soup = BeautifulSoup(result, 'lxml') + for p in soup.find_all('p'): + self.assertEqual(p.attrs.get('region'), DFXP_DEFAULT_REGION_ID) def test_correct_region_attributes_are_recreated(self): caption_set = DFXPReader().read(SAMPLE_DFXP_MULTIPLE_REGIONS_INPUT) @@ -160,12 +160,12 @@ def test_fit_to_screen(self): def test_proper_xml_entity_escaping(self): caption_set = DFXPReader().read(DFXP_WITH_ESCAPED_APOSTROPHE) - cue_text = caption_set.get_captions(u'en-US')[0].nodes[0].content + cue_text = caption_set.get_captions('en-US')[0].nodes[0].content self.assertEqual( - cue_text, u"<< \"Andy's Caf\xe9 & Restaurant\" this way") + cue_text, "<< \"Andy's Caf\xe9 & Restaurant\" this way") result = DFXPWriter().write(caption_set) self.assertIn( - u"<< \"Andy's Café & Restaurant\" this way", + "<< \"Andy's Café & Restaurant\" this way", result ) @@ -191,10 +191,10 @@ def test_dfxp_to_sami_with_margins(self): caption_set = DFXPReader().read(SAMPLE_DFXP_FROM_SAMI_WITH_MARGINS) results = SAMIWriter(video_width=VIDEO_WIDTH, video_height=VIDEO_HEIGHT).write(caption_set) - margins = [u"margin-right: 6.04%;", - u"margin-bottom: 0%;", - u"margin-top: 0%;", - u"margin-left: 6.04%;"] + margins = ["margin-right: 6.04%;", + "margin-bottom: 0%;", + "margin-top: 0%;", + "margin-left: 6.04%;"] for margin in margins: self.assertIn(margin, results) @@ -238,7 +238,7 @@ def test_dfxp_to_webvtt_adds_explicit_size(self): caption_set = DFXPReader().read(SAMPLE_DFXP_LONG_CUE) results = WebVTTWriter().write(caption_set) self.assertTrue(isinstance(results, text_type)) - self.assertEquals( + self.assertEqual( SAMPLE_WEBVTT_OUTPUT_LONG_CUE, results) def test_dfxp_to_webvtt_preserves_proper_alignment(self): @@ -247,7 +247,7 @@ def test_dfxp_to_webvtt_preserves_proper_alignment(self): # WebVTTWriter. caption_set = DFXPReader().read(DFXP_STYLE_REGION_ALIGN_CONFLICT) results = WebVTTWriter().write(caption_set) - self.assertEquals( + self.assertEqual( WEBVTT_FROM_DFXP_WITH_CONFLICTING_ALIGN, results) diff --git a/tests/test_sami.py b/tests/test_sami.py index 7f34c625..5b0a0fbe 100644 --- a/tests/test_sami.py +++ b/tests/test_sami.py @@ -19,27 +19,27 @@ def test_detection(self): def test_caption_length(self): captions = SAMIReader().read(SAMPLE_SAMI) - self.assertEqual(7, len(captions.get_captions(u"en-US"))) + self.assertEqual(7, len(captions.get_captions("en-US"))) def test_proper_timestamps(self): captions = SAMIReader().read(SAMPLE_SAMI) - paragraph = captions.get_captions(u"en-US")[2] + paragraph = captions.get_captions("en-US")[2] - self.assertEquals(17000000, paragraph.start) - self.assertEquals(18752000, paragraph.end) + self.assertEqual(17000000, paragraph.start) + self.assertEqual(18752000, paragraph.end) def test_6digit_color_code_from_6digit_input(self): captions = SAMIReader().read(SAMPLE_SAMI) - p_style = captions.get_style(u"p") + p_style = captions.get_style("p") - self.assertEqual(u"#ffeedd", p_style[u'color']) + self.assertEqual("#ffeedd", p_style['color']) def test_6digit_color_code_from_3digit_input(self): captions = SAMIReader().read( - SAMPLE_SAMI.replace(u"#ffeedd", u"#fed")) - p_style = captions.get_style(u"p") + SAMPLE_SAMI.replace("#ffeedd", "#fed")) + p_style = captions.get_style("p") - self.assertEqual(u"#ffeedd", p_style[u'color']) + self.assertEqual("#ffeedd", p_style['color']) def test_empty_file(self): self.assertRaises( @@ -48,7 +48,7 @@ def test_empty_file(self): def test_invalid_markup_is_properly_handled(self): captions = SAMIReader().read(SAMPLE_SAMI_SYNTAX_ERROR) - self.assertEqual(2, len(captions.get_captions(u"en-US"))) + self.assertEqual(2, len(captions.get_captions("en-US"))) def test_partial_margins(self): caption_set = SAMIReader().read(SAMPLE_SAMI_PARTIAL_MARGINS) @@ -56,28 +56,28 @@ def test_partial_margins(self): # (i.e. "0%") self.assertEqual( caption_set.layout_info.padding.to_xml_attribute(), - u'0% 29pt 0% 29pt' + '0% 29pt 0% 29pt' ) def test_sami_with_bad_span_align(self): caption_set = SAMIReader().read(SAMPLE_SAMI_WITH_BAD_SPAN_ALIGN) caption = caption_set.get_captions('en-US')[0] - self.assertEquals(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) + self.assertEqual(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) def test_sami_with_bad_div_align(self): caption_set = SAMIReader().read(SAMPLE_SAMI_WITH_BAD_DIV_ALIGN) caption = caption_set.get_captions('en-US')[0] - self.assertEquals(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) + self.assertEqual(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) def test_sami_with_p_align(self): caption_set = SAMIReader().read(SAMPLE_SAMI_WITH_P_ALIGN) caption = caption_set.get_captions('en-US')[0] - self.assertEquals(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) + self.assertEqual(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) def test_sami_with_p_and_span_align(self): """ align DOES NOT override

align if it is specified inline. """ caption_set = SAMIReader().read(SAMPLE_SAMI_WITH_P_AND_SPAN_ALIGN) caption = caption_set.get_captions('en-US')[0] - self.assertEquals(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) + self.assertEqual(caption.layout_info.alignment.horizontal, HorizontalAlignmentEnum.RIGHT) diff --git a/tests/test_sami_conversion.py b/tests/test_sami_conversion.py index 00624801..dc787518 100644 --- a/tests/test_sami_conversion.py +++ b/tests/test_sami_conversion.py @@ -121,9 +121,9 @@ def test_sami_to_dfxp_xml_output(self): results = DFXPWriter(relativize=False, fit_to_screen=False).write(captions) self.assertTrue(isinstance(results, six.text_type)) - self.assertTrue(u'xmlns="http://www.w3.org/ns/ttml"' in results) + self.assertTrue('xmlns="http://www.w3.org/ns/ttml"' in results) self.assertTrue( - u'xmlns:tts="http://www.w3.org/ns/ttml#styling"' in results) + 'xmlns:tts="http://www.w3.org/ns/ttml#styling"' in results) class SAMItoWebVTTTestCase(unittest.TestCase, WebVTTTestingMixIn): @@ -164,4 +164,4 @@ def test_sami_to_sami_conversion(self): results = SAMIWriter().write(caption_set) self.assertTrue(isinstance(results, six.text_type)) self.assertSAMIEquals(SAMPLE_SAMI_WITH_LANG, results) - self.assertTrue(u"lang: en-US;" in results) + self.assertTrue("lang: en-US;" in results) diff --git a/tests/test_scc.py b/tests/test_scc.py index 45f33177..7d474f87 100644 --- a/tests/test_scc.py +++ b/tests/test_scc.py @@ -27,11 +27,11 @@ def test_detection(self): def test_caption_length(self): captions = SCCReader().read(SAMPLE_SCC_POP_ON) - self.assertEqual(7, len(captions.get_captions(u"en-US"))) + self.assertEqual(7, len(captions.get_captions("en-US"))) def test_proper_timestamps(self): captions = SCCReader().read(SAMPLE_SCC_POP_ON) - paragraph = captions.get_captions(u"en-US")[2] + paragraph = captions.get_captions("en-US")[2] delta_start = abs(paragraph.start - 17000000) delta_end = abs(paragraph.end - 18752000) @@ -64,7 +64,7 @@ def test_scc_positioning_is_read(self): ] actual_positioning = [ caption_.layout_info.origin.serialized() for caption_ in - captions.get_captions(u'en-US') + captions.get_captions('en-US') ] self.assertEqual(expected_positioning, actual_positioning) @@ -78,7 +78,7 @@ def test_correct_last_bad_timing(self): (3208266666.666667, 3269700000.0)] actual_timings = [ - (c_.start, c_.end) for c_ in caption_set.get_captions(u'en-US') + (c_.start, c_.end) for c_ in caption_set.get_captions('en-US') ] self.assertEqual(expected_timings, actual_timings) @@ -94,26 +94,26 @@ def switches_italics(node): :rtype: bool """ if not node.type_ == node.STYLE: - raise ValueError(u"This should be a style node.") + raise ValueError("This should be a style node.") return node.start caption_set = SCCReader().read(SAMPLE_SCC_WITH_ITALICS) - nodes = caption_set.get_captions(u'en-US')[0].nodes + nodes = caption_set.get_captions('en-US')[0].nodes # We assert that the text is specified in italics. # If Style nodes are replaced, the way these 3 assertions are made # will most likely change self.assertEqual(switches_italics(nodes[0]), True) self.assertEqual(switches_italics(nodes[2]), False) - self.assertEqual(nodes[1].content, u'abababab') + self.assertEqual(nodes[1].content, 'abababab') def test_default_positioning_when_no_positioning_is_specified(self): caption_set = SCCReader().read(SAMPLE_NO_POSITIONING_AT_ALL_SCC) actual_caption_layouts = [ caption.layout_info.serialized() - for caption in caption_set.get_captions(u'en-US') + for caption in caption_set.get_captions('en-US') ] expected_caption_layouts = [ @@ -145,9 +145,9 @@ def test_timing_is_properly_set_on_split_captions(self): caption_set = SCCReader().read( SAMPLE_SCC_PRODUCES_CAPTIONS_WITH_START_AND_END_TIME_THE_SAME ) - expected_timings = [(u'00:01:35.666', u'00:01:40.866'), - (u'00:01:35.666', u'00:01:40.866'), - (u'00:01:35.666', u'00:01:40.866')] + expected_timings = [('00:01:35.666', '00:01:40.866'), + ('00:01:35.666', '00:01:40.866'), + ('00:01:35.666', '00:01:40.866')] actual_timings = [(c_.format_start(), c_.format_end()) for c_ in caption_set.get_captions('en-US')] @@ -168,37 +168,37 @@ def test_freeze_rollup_captions_contents(self): # There were no tests for ROLL-UP captions, but the library processed # Roll-Up captions. Make sure nothing changes during the refactoring scc1 = SCCReader().read(SAMPLE_SCC_ROLL_UP_RU2) - captions = scc1.get_captions(u'en-US') + captions = scc1.get_captions('en-US') actual_texts = [cap_.nodes[0].content for cap_ in captions] - expected_texts = [u'>>> HI', - u"I'M KEVIN CUNNING AND AT", + expected_texts = ['>>> HI', + "I'M KEVIN CUNNING AND AT", # Notice the missing 'N' at the end. This is because # the input is not OK (should only use 4 byte "words" # (filling in with '80' where only 2 bytes are # meaningful) - u"INVESTOR'S BANK WE BELIEVE I", - u'HELPING THE LOCAL NEIGHBORHOOD', - u'AND IMPROVING THE LIVES OF ALL', - u'WE SERVE', + "INVESTOR'S BANK WE BELIEVE I", + 'HELPING THE LOCAL NEIGHBORHOOD', + 'AND IMPROVING THE LIVES OF ALL', + 'WE SERVE', # special chars. Last one should be printer 2 times # XXX this is a bug. - u'®°½', + '®°½', # special/ extended chars delete last 0-4 chars. # XXX - this is a bug. - u'ABû', - u'ÁÉÓ¡', - u"WHERE YOU'RE STANDING NOW,", - u"LOOKING OUT THERE, THAT'S AL", - u'THE CROWD.', - u'>> IT WAS GOOD TO BE IN TH', - u"And restore Iowa's land, water", - u'And wildlife.', - u'>> Bike Iowa, your source for'] + 'ABû', + 'ÁÉÓ¡', + "WHERE YOU'RE STANDING NOW,", + "LOOKING OUT THERE, THAT'S AL", + 'THE CROWD.', + '>> IT WAS GOOD TO BE IN TH', + "And restore Iowa's land, water", + 'And wildlife.', + '>> Bike Iowa, your source for'] self.assertEqual(expected_texts, actual_texts) def test_freeze_semicolon_spec_time(self): scc1 = SCCReader().read(SAMPLE_SCC_ROLL_UP_RU2) - captions = scc1.get_captions(u'en-US') + captions = scc1.get_captions('en-US') expected_timings = [(766666.6666666667, 2800000.0), (2800000.0, 4600000.0), (4600000.0, 6166666.666666667), @@ -233,7 +233,7 @@ def test_freeze_colon_spec_time(self): (32165466.66666666, 36202833.33333332)] actual_timings = [ - (c_.start, c_.end) for c_ in scc1.get_captions(u'en-US')] + (c_.start, c_.end) for c_ in scc1.get_captions('en-US')] self.assertEqual(expected_timings, actual_timings) @@ -323,7 +323,7 @@ def __init__(self, start=0, end=0, nodes=(1, 2)): self.end = end def __repr__(self): - return u"{start}-->{end}".format(start=self.start, end=self.end) + return "{start}-->{end}".format(start=self.start, end=self.end) class TimingCorrectingCaptionListTestCase(unittest.TestCase): diff --git a/tests/test_srt.py b/tests/test_srt.py index 6e5e2827..37352ade 100644 --- a/tests/test_srt.py +++ b/tests/test_srt.py @@ -15,18 +15,18 @@ def test_detection(self): def test_caption_length(self): captions = SRTReader().read(SAMPLE_SRT) - self.assertEqual(7, len(captions.get_captions(u"en-US"))) + self.assertEqual(7, len(captions.get_captions("en-US"))) def test_proper_timestamps(self): captions = SRTReader().read(SAMPLE_SRT) - paragraph = captions.get_captions(u"en-US")[2] + paragraph = captions.get_captions("en-US")[2] self.assertEqual(17000000, paragraph.start) self.assertEqual(18752000, paragraph.end) def test_numeric_captions(self): captions = SRTReader().read(SAMPLE_SRT_NUMERIC) - self.assertEqual(7, len(captions.get_captions(u"en-US"))) + self.assertEqual(7, len(captions.get_captions("en-US"))) def test_empty_file(self): self.assertRaises( @@ -35,8 +35,8 @@ def test_empty_file(self): def test_extra_empty_line(self): captions = SRTReader().read(SAMPLE_SRT_BLANK_LINES) - self.assertEqual(2, len(captions.get_captions(u"en-US"))) + self.assertEqual(2, len(captions.get_captions("en-US"))) def test_extra_trailing_empty_line(self): captions = SRTReader().read(SAMPLE_SRT_TRAILING_BLANKS) - self.assertEqual(2, len(captions.get_captions(u"en-US"))) + self.assertEqual(2, len(captions.get_captions("en-US"))) diff --git a/tests/test_webvtt.py b/tests/test_webvtt.py index 1a56938d..eef7df6b 100644 --- a/tests/test_webvtt.py +++ b/tests/test_webvtt.py @@ -27,30 +27,30 @@ def test_negative_answer_for_detection(self): def test_caption_length(self): captions = self.reader.read(SAMPLE_WEBVTT_2) - self.assertEqual(len(captions.get_captions(u'en-US')), 7) + self.assertEqual(len(captions.get_captions('en-US')), 7) def test_read_supports_multiple_languages(self): - captions = self.reader.read(SAMPLE_WEBVTT, lang=u'es') - self.assertIsNotNone(captions.get_captions(u'es')) + captions = self.reader.read(SAMPLE_WEBVTT, lang='es') + self.assertIsNotNone(captions.get_captions('es')) def test_proper_timestamps(self): captions = self.reader.read(SAMPLE_WEBVTT) - cue = captions.get_captions(u'en-US')[2] + cue = captions.get_captions('en-US')[2] self.assertEqual(cue.start, 17000000) self.assertEqual(cue.end, 18752000) def test_webvtt_cue_components_removed_from_text(self): result = self.reader._remove_styles( - u"Wikipedia is a great adventure. It may have " - u"its shortcomings, but it is the largest collective " - u"knowledge construction endevour base text " - u"annotation Yes, indeed!" + "Wikipedia is a great adventure. It may have " + "its shortcomings, but it is the largest collective " + "knowledge construction endevour base text " + "annotation Yes, indeed!" ) expected = ( - u"Wikipedia is a great adventure. It may have " - u"its shortcomings, but it is the largest collective " - u"knowledge construction endevour base text annotation" - u" Audry: Yes, indeed!" + "Wikipedia is a great adventure. It may have " + "its shortcomings, but it is the largest collective " + "knowledge construction endevour base text annotation" + " Audry: Yes, indeed!" ) self.assertEqual(result, expected) @@ -63,26 +63,26 @@ def test_not_ignoring_timing_errors(self): self.assertRaises( CaptionReadError, WebVTTReader(ignore_timing_errors=False).read, - (u"\n" - u"00:00:20.000 --> 00:00:10.000\n" - u"foo bar baz") + ("\n" + "00:00:20.000 --> 00:00:10.000\n" + "foo bar baz") ) self.assertRaises( CaptionReadError, WebVTTReader(ignore_timing_errors=False).read, - (u"00:00:20.000 --> 00:00:10.000\n" - u"Start time is greater than end time.\n") + ("00:00:20.000 --> 00:00:10.000\n" + "Start time is greater than end time.\n") ) self.assertRaises( CaptionReadError, WebVTTReader(ignore_timing_errors=False).read, - (u"00:00:20.000 --> 00:00:30.000\n" - u"Start times should be consecutive.\n" - u"\n" - u"00:00:10.000 --> 00:00:20.000\n" - u"This cue starts before the previous one.\n") + ("00:00:20.000 --> 00:00:30.000\n" + "Start times should be consecutive.\n" + "\n" + "00:00:10.000 --> 00:00:20.000\n" + "This cue starts before the previous one.\n") ) def test_ignoring_timing_errors(self): @@ -90,72 +90,72 @@ def test_ignoring_timing_errors(self): self.assertRaises( CaptionReadSyntaxError, WebVTTReader().read, - (u"\nNOTE invalid cue stamp\n" - u"00:00:20.000 --> \n" - u"foo bar baz\n") + ("\nNOTE invalid cue stamp\n" + "00:00:20.000 --> \n" + "foo bar baz\n") ) # And this too self.assertRaises( CaptionReadSyntaxError, WebVTTReader().read, - (u"\n00:00:20,000 --> 00:00:22,000\n" - u"Note the comma instead of point.\n") + ("\n00:00:20,000 --> 00:00:22,000\n" + "Note the comma instead of point.\n") ) try: WebVTTReader().read( - (u"\n" - u"00:00:20.000 --> 00:00:10.000\n" - u"Start time is greater than end time.\n") + ("\n" + "00:00:20.000 --> 00:00:10.000\n" + "Start time is greater than end time.\n") ) except CaptionReadError: - self.fail(u"Shouldn't raise CaptionReadError") + self.fail("Shouldn't raise CaptionReadError") try: WebVTTReader().read( - (u"\n" - u"00:00:20.000 --> 00:00:30.000\n" - u"Start times should be consecutive.\n" - u"\n" - u"00:00:10.000 --> 00:00:20.000\n" - u"This cue starts before the previous one.\n") + ("\n" + "00:00:20.000 --> 00:00:30.000\n" + "Start times should be consecutive.\n" + "\n" + "00:00:10.000 --> 00:00:20.000\n" + "This cue starts before the previous one.\n") ) except CaptionReadError: - self.fail(u"Shouldn't raise CaptionReadError") + self.fail("Shouldn't raise CaptionReadError") def test_invalid_files(self): self.assertRaises( CaptionReadSyntaxError, WebVTTReader().read, - (u"\nNOTE Cues without text are invalid.\n" - u"00:00:20.000 --> 00:00:30.000\n" - u"\n" - u"00:00:40.000 --> 00:00:50.000\n" - u"foo bar baz\n") + ("\nNOTE Cues without text are invalid.\n" + "00:00:20.000 --> 00:00:30.000\n" + "\n" + "00:00:40.000 --> 00:00:50.000\n" + "foo bar baz\n") ) self.assertRaises( CaptionReadError, WebVTTReader(ignore_timing_errors=False).read, - (u"00:00:20.000 --> 00:00:10.000\n" - u"Start time is greater than end time.") + ("00:00:20.000 --> 00:00:10.000\n" + "Start time is greater than end time.") ) self.assertRaises( CaptionReadError, WebVTTReader(ignore_timing_errors=False).read, - (u"00:00:20.000 --> 00:00:30.000\n" - u"Start times should be consecutive.\n" - u"\n" - u"00:00:10.000 --> 00:00:20.000\n" - u"This cue starts before the previous one.\n") + ("00:00:20.000 --> 00:00:30.000\n" + "Start times should be consecutive.\n" + "\n" + "00:00:10.000 --> 00:00:20.000\n" + "This cue starts before the previous one.\n") ) def test_zero_start(self): captions = self.reader.read(SAMPLE_WEBVTT_LAST_CUE_ZERO_START) - cue = captions.get_captions(u'en-US')[0] + cue = captions.get_captions('en-US')[0] self.assertEqual(cue.start, 0) diff --git a/tox.ini b/tox.ini index 6623fd82..3a47fbcb 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27, py34, py35 +envlist = py34, py35 [testenv] deps= beautifulsoup4==4.4