#!/usr/bin/env python2.6
# -*- coding: utf-8 -*-
# Time-stamp: <2013-01-09 18:53:03 (kthoden)>

__doc__="""Script to deal with things that are not supported right now
in the schema, but are used by the Harriot online project.

In addition to that, this script takes pieces out of the XML which are
used right now in Eclipse but are not valid. For example, the schema
location pointing to a local directory is removed.

Also, with the current state of the editor, the xml-namespace before
the space-attribute is not inserted automatically. This script puts it
in.
 """

import codecs
import re
import sys
import os
# specify the path to the share directory
sys.path.append(os.environ["ECHO_SCRIPTS_DIR"]+"/share")
import common_functions

common_functions.consoleStatus("Starting conversion to valid XML")

# The input file is expected as the last command-line argument.
# NOTE(review): when the script is started without arguments, sys.argv[-1]
# is the script's own path, so the script would then convert itself.
input_text = sys.argv[-1]

# Read the whole document as UTF-8. The with-block closes the file handle
# as soon as the content has been read instead of leaving it open until
# the interpreter happens to collect it.
with codecs.open(input_text, 'r', "utf-8") as input_file:
    text = input_file.read()

# the latex2mathml script crashes the encoding
mathmlreplacements = {
u"Ã¦" : u"æ", 
u"Ã©" : u"é",
u"Ã¨" : u"è",
u"Ã " : u"à",
u"Ã§" : u"ç",
u"Ã\x86" : u"Æ",
u"â\x80¦" : u"…",
u"Ã\x97" : u"\xd7",             # times sign
u"â\x88£" : u"|",
u"â\x80\x93" : u"–",
u"Â±" : u"±",
u"Â£" : u"£",
u"Ã" : u"ß",
u"â\x80\x98" : u"\x2018",                # LEFT SINGLE QUOTATION MARK
u"â\x80\x99" : u"\x2019",        # SINGLE COMMA QUOTATION MARK
u"É\x9b" : u"ɛ",
u"Î±" : u"α",
u"Î²" : u"β",
u"Î³" : u"γ",
u"Ï\x83" : u"σ",
u"Ï\x88" : u"ψ",
u"Î½" : u"ν",
u"Ï\x85" : u"υ"
}


# Replace things
#
# All patterns are raw strings so that regex escapes such as \s reach the
# re module untouched instead of depending on Python leaving unknown
# string escapes alone.  The order of the substitutions matters and is
# kept as it was.

# --- editor issues: things that occur while working with eclipse ---
# remove the schema location (it points to a local directory)
text = re.sub(r'(xsi:schemaLocation="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/ .*?/echo.xsd ")', '', text)
# add the xml: prefix to the space attribute for the time being; the
# lookbehind requires a preceding blank, so an existing xml:space is left alone
text = re.sub(r'(?<= )space="preserve"', 'xml:space="preserve"', text)
# strip the echo: prefix from opening and closing tags
# (should not occur in the future)
text = text.replace('<echo:', '<')
text = text.replace('</echo:', '</')

# --- schema issues: some tags are not yet supported by the current schema ---
#text = re.sub('style="edins"', 'style="super"', text ) # Not yet supported by schema
# ^word is a shorthand for an insertion and is rendered as superscript.
# NOTE(review): \S+ also swallows markup that follows the word without a
# blank in between (e.g. a closing tag), which would produce badly nested
# XML -- confirm that the shorthand is always followed by whitespace.
text = re.sub(r'\^(\S+)', r'<emph style="super">\1</emph>', text)
text = text.replace('style="ins"', 'style="super"')
# <unsure> is shown as [¿]...[?]; without DOTALL this only matches when
# the opening and closing tag are on the same line
text = re.sub(r'<unsure>(.*?)<\/unsure>', u'[¿]\\1[?]', text)
text = text.replace('<unsure/>', u'[¿?]')
text = text.replace('<illegible/>', '[???]')
text = text.replace('<omission/>', '<lb/>[...]<lb/>')

# Translations and commentaries will stretch over several lines, hence
# the compiled patterns with DOTALL.  Each one captures the opening tag
# (plus an optional newline), the content, and the closing tag.
pattern_edins = re.compile(r'<emph style="edins">(.*?)</emph>', re.DOTALL)
pattern_page_com = re.compile(r'(<c\s.*?>[\n]?)(.*?)([\n]?<\/c>)', re.DOTALL)
pattern_par_com = re.compile(r'(<pc\s.*?>[\n]?)(.*?)([\n]?<\/pc>)', re.DOTALL)
pattern_semunit_com = re.compile(r'(<sc\s.*?>[\n]?)(.*?)([\n]?<\/sc>)', re.DOTALL)
# pattern_translation = re.compile('(<t[^ext].*?>)(.*?)(<\/t>)', re.DOTALL)
pattern_translation = re.compile(r'(<t\s.*?>[\n]?)(.*?)([\n]?<\/t>)', re.DOTALL)

# editorial insertions are put in square brackets (not yet supported by schema)
text = pattern_edins.sub(r'[\1]', text)

# as of 2012-03-19, an invalid ECHO document is able to be uploaded
# In that case, it would be nice also to display commentaries
# text = re.sub(pattern_page_com,r"<!-- \1\2\3 -->",text)
# text = re.sub(pattern_par_com,r"<!-- \1\2\3 -->",text)
# text = re.sub(pattern_semunit_com,r"<!-- \1\2\3 -->",text)
#text = re.sub(pattern_translation,r"<!-- \1\2\3 -->",text)
# jackie now produces long passages of texts and uses paragraphs by herself
#text = re.sub(pattern_page_com,r'<div type="page_commentary" level="0" n="0">\n<p>\n<s xml:space="preserve">[<emph style="it">Note: </emph>\2]\n<lb/><lb/></s></p></div>',text) # some more space between page commentary and text

# page commentary: own div, with some more space between commentary and text
text = pattern_page_com.sub(r'<div type="page_commentary" level="0" n="0">\n<p>\n<s xml:space="preserve">[<emph style="it">Note: </emph></s></p>\2]\n<lb/><lb/></div>', text)
# paragraph commentary: bracketed note inside its own sentence
text = pattern_par_com.sub(r' <s xml:space="preserve">[<emph style="it">Note:</emph> \2]</s><lb/>', text)
# commentary on a semantic unit: bracketed note
text = pattern_semunit_com.sub(r'[<emph style="it">Note:</emph> \2]<lb/>', text)
# translation: bracketed, marked with "tr:"
text = pattern_translation.sub(r'<lb/>[<emph style="it">tr:</emph> \2]<lb/>', text)
#text = re.sub(pattern_translation,r'[<emph style="it">\2</emph>]',text)

# Repair the garbled character sequences listed in mathmlreplacements.
# The keys are processed longest first, with ties broken alphabetically,
# so the result does not depend on the arbitrary iteration order of the
# dictionary: a short key can never eat the first character of a longer
# sequence before that sequence has been replaced as a whole.
for garbled in sorted(mathmlreplacements, key=lambda seq: (-len(seq), seq)):
    text = text.replace(garbled, mathmlreplacements[garbled])

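# TODO: the language attribute is doubled up (elements that carry xml:lang
# also carry a plain lang attribute). The attempt below to strip the
# duplicate does not work yet, so it stays disabled.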
# pattern_lang = re.compile('(<.*?xml:lang.*?)(\slang="[a-z]{,3}")(.*?>)', re.DOTALL)
# text = re.sub(pattern_lang,r'\1\3',text)

# output
# Hand the converted document to the shared helper from common_functions.
# NOTE(review): the helper is not visible here; judging by its name it
# writes the text to the output text file -- confirm.
common_functions.printInOutputTextFile(text)
# Report completion on the console, matching the message printed at start-up.
common_functions.consoleStatus("Conversion to valid XML finished")

# the end
