#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# Time-stamp: <2015-07-15 10:30:26 (kthoden)>

# Sets the module docstring explicitly through __doc__.
# NOTE(review): "ding" looks like a placeholder -- presumably it should be
# replaced with a real description of the script; confirm with the author.
__doc__="""ding"""

import sys
import re
from StringIO import StringIO
import codecs

# Name of the file the cleaned XML is written to (in the current directory).
output_filename = '01_cleanedURL.xml'

# Matches the xsi:schemaLocation attribute for the ECHO 1.0 namespace, with
# any (non-greedy) location that ends in "/echo.xsd ".
_SCHEMA_LOCATION_RE = re.compile(
    r'(xsi:schemaLocation="http://www.mpiwg-berlin.mpg.de/ns/echo/1.0/ .*?/echo.xsd ")')

# Negative lookahead assertion: matches an ampersand only if it is not
# followed by "amp;".  Note that other entities such as "&lt;" or "&#x2019;"
# are therefore escaped as well (e.g. "&lt;" becomes "&amp;lt;").
_BARE_AMPERSAND_RE = re.compile(r'&(?!amp;)')


def clean_text(text):
    """Return *text* with the schema location removed and ampersands escaped.

    Two substitutions are applied in order:

    1. every ECHO ``xsi:schemaLocation="..."`` attribute is deleted;
    2. every ``&`` that is not already the start of ``&amp;`` is replaced
       by ``&amp;``.

    Args:
        text: the document content as a (unicode) string.

    Returns:
        The cleaned string.
    """
    without_schema = _SCHEMA_LOCATION_RE.sub('', text)
    return _BARE_AMPERSAND_RE.sub('&amp;', without_schema)


def main(argv=None):
    """Clean the file named by the last command line argument.

    The input is read as UTF-8, passed through ``clean_text`` and written
    as UTF-8 to ``output_filename``.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 2 if no input file was given.
    """
    if argv is None:
        argv = sys.argv

    # Without this check argv[-1] would be the script itself, which would
    # then be "cleaned" and written to the output file.
    if len(argv) < 2:
        sys.stderr.write("usage: %s INPUT.xml\n" % argv[0])
        return 2

    # The input file is always the last argument.
    input_filename = argv[-1]

    # Context managers make sure both files are really closed (and the
    # output flushed), even if reading or writing fails.
    with codecs.open(input_filename, 'r', 'utf-8') as input_file:
        text = input_file.read()

    out = clean_text(text)

    with codecs.open(output_filename, 'w', 'utf-8') as output_file:
        output_file.write(out)
    print("wrote to output file %s" % output_filename)
    return 0


if __name__ == "__main__":
    sys.exit(main())
