#!/usr/bin/env python2.6
# -*- coding: utf-8 -*-
# Time-stamp: <2013-01-09 18:53:03 (kthoden)>
"""Turn a Harriot online project document into XML the schema accepts.

The script deals with things that are not supported right now in the
schema but are used by the Harriot online project.  It also takes out
pieces of the XML which are needed while working in Eclipse but are not
valid, for example a schema location pointing to a local directory.
With the current state of the editor the ``xml:`` namespace prefix is not
inserted in front of the ``space`` attribute; this script puts it in.

Usage: the input file is the last command line argument.  The environment
variable ECHO_SCRIPTS_DIR must point to the directory whose ``share``
subdirectory contains ``common_functions``.

NOTE(review): the copy of this file that was reviewed had XML-like markup
stripped out of several string literals.  Tags that could be recovered
from what survived are restored below and marked; rules that could not be
recovered are listed in a NOTE next to ``_INLINE_RULES``.  Compare with
the version-controlled original before relying on this file.
"""

import codecs
import re
import sys
import os

# The latex2mathml script crashes the encoding: UTF-8 byte sequences come
# back decoded as Latin-1.  Every key is the Latin-1 reading of the UTF-8
# bytes of the intended character and is spelled with escapes, because
# several keys contain invisible characters (U+0080..U+00A0) that do not
# survive copy and paste.  No key is a substring of another key, so the
# order of replacement does not matter.
mathmlreplacements = {
    u"\xc3\xa6": u"\xe6",        # æ
    u"\xc3\xa9": u"\xe9",        # é
    u"\xc3\xa8": u"\xe8",        # è
    u"\xc3\xa0": u"\xe0",        # à (second key character is U+00A0)
    u"\xc3\xa7": u"\xe7",        # ç
    u"\xc3\x86": u"\xc6",        # Æ
    u"\xe2\x80\xa6": u"\u2026",  # … horizontal ellipsis
    u"\xc3\x97": u"\xd7",        # × times sign
    u"\xe2\x88\xa3": u"|",       # U+2223 DIVIDES, mapped to the ASCII bar
    u"\xe2\x80\x93": u"\u2013",  # – en dash
    u"\xc2\xb1": u"\xb1",        # ±
    u"\xc2\xa3": u"\xa3",        # £
    u"\xc3\x9f": u"\xdf",        # ß
    # "\x2018" would be read as "\x20" followed by "18"; a code point
    # above U+00FF needs the \u escape.
    u"\xe2\x80\x98": u"\u2018",  # LEFT SINGLE QUOTATION MARK
    u"\xe2\x80\x99": u"\u2019",  # SINGLE COMMA QUOTATION MARK
    u"\xc9\x9b": u"\u025b",      # ɛ
    u"\xce\xb1": u"\u03b1",      # α
    u"\xce\xb2": u"\u03b2",      # β
    u"\xce\xb3": u"\u03b3",      # γ
    u"\xcf\x83": u"\u03c3",      # σ
    u"\xcf\x88": u"\u03c8",      # ψ
    u"\xce\xbd": u"\u03bd",      # ν
    u"\xcf\x85": u"\u03c5",      # υ
}

# Single-line substitutions, applied in this order.
_INLINE_RULES = (
    # editor issues: things that occur while working with eclipse
    # remove schema location
    (re.compile(r'(xsi:schemaLocation="http://www.mpiwg-berlin.mpg.de'
                r'/ns/echo/1.0/ .*?/echo.xsd ")'), u''),
    # add xml: prefix for the time being
    (re.compile(r'(?<= )space="preserve"'), u'xml:space="preserve"'),
    # Not yet supported by schema
    (re.compile(r'style="ins"'), u'style="super"'),
    # NOTE(review): the opening <unsure> tag is restored from the closing
    # tag that survived in the pattern; confirm it carries no attributes.
    (re.compile(r'<unsure>(.*?)<\/unsure>'), u'[\xbf]\\1[?]'),
    # NOTE(review): pattern was lost; presumably the empty element form,
    # since the replacement is the bracket pair without content.  Confirm.
    (re.compile(r'<unsure/>'), u'[\xbf?]'),
)

# NOTE(review): four further rules could not be recovered and are NOT
# applied.  What survived of them:
#   * "shorthand for insertion": only a replacement containing \1 is left,
#     the pattern and the surrounding markup are gone.  It ran directly
#     before the style="ins" rule.
#   * an element replaced by '[???]' (not yet supported by schema)
#   * an element replaced by '[...]' (not yet supported by schema)
#   * "pattern_edins": an element matched across lines (re.DOTALL) whose
#     content was replaced by r'[\1]'.  It ran before the commentaries.
# Applying them as they appeared would raise a TypeError or insert the
# replacement at every position of the text.

# Translations and commentaries will stretch over several lines, hence
# re.DOTALL.  NOTE(review): the opening tags are restored from the closing
# tags that survived in the patterns; confirm against the original.
#
# As of 2012-03-19 an invalid ECHO document can be uploaded as well; in
# that case it is nice to display the commentaries too, so they are
# rewritten into visible notes instead of being removed.
#
# NOTE(review): the "~~" markers and the blank lines in the replacements
# are reproduced exactly as found; they look like the residue of stripped
# markup.  Restore the intended tags from the original.
_MULTILINE_RULES = (
    # page commentary, with some more space between commentary and text
    # (the author now writes long passages and sets paragraphs herself)
    (re.compile(r'(<c>[\n]?)(.*?)([\n]?<\/c>)', re.DOTALL),
     u'\n\n' u'\\n~~[Note:~~' u'\n\n' u'\\2]\\n' u'\n\n'),
    # paragraph commentary
    (re.compile(r'(<pc>[\n]?)(.*?)([\n]?<\/pc>)', re.DOTALL),
     u' ~~[Note: \\2]~~'),
    # commentary on a semantic unit
    (re.compile(r'(<sc>[\n]?)(.*?)([\n]?<\/sc>)', re.DOTALL),
     u'[Note: \\2]'),
    # translation
    (re.compile(r'(<t>[\n]?)(.*?)([\n]?<\/t>)', re.DOTALL),
     u'[tr: \\2]'),
)

# TODO: the language attribute is doubled up as well.  An earlier attempt
# with the pattern '(<.*?xml:lang.*?)(\slang="[a-z]{,3}")(.*?>)' (DOTALL,
# keeping groups 1 and 3) did not work and is not applied.


def repair_mojibake(text):
    """Return *text* with the sequences in ``mathmlreplacements`` repaired."""
    for broken, intended in mathmlreplacements.items():
        text = text.replace(broken, intended)
    return text


def make_valid(text):
    """Return *text* rewritten so that it validates against the schema.

    Applies the single-line rules, then the commentary and translation
    rules, and finally repairs the mis-decoded characters.  *text* is the
    complete document as a unicode string; an empty string is returned
    unchanged.
    """
    for pattern, replacement in _INLINE_RULES + _MULTILINE_RULES:
        text = pattern.sub(replacement, text)
    return repair_mojibake(text)


def main():
    """Convert the file named by the last command line argument.

    Raises KeyError when ECHO_SCRIPTS_DIR is not set and IOError when the
    input file cannot be read.
    """
    # specify the path to the share directory
    sys.path.append(os.environ["ECHO_SCRIPTS_DIR"] + "/share")
    import common_functions

    common_functions.consoleStatus("Starting conversion to valid XML")

    # input text is last argument
    input_text = sys.argv[-1]
    source = codecs.open(input_text, 'r', "utf-8")
    try:
        text = source.read()
    finally:
        # close the handle even when decoding fails
        source.close()

    # output
    common_functions.printInOutputTextFile(make_valid(text))
    common_functions.consoleStatus("Conversion to valid XML finished")


if __name__ == "__main__":
    main()
# the end