# HG changeset patch
# User Dirk Wintergruen
# Date 1408450372 -7200
# Node ID 5e33fa5a2fdc01b3f27f9567bf2caeff249b948e
initial

diff -r 000000000000 -r 5e33fa5a2fdc .hgignore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,3 @@
+
+syntax: regexp
+^data$
\ No newline at end of file
diff -r 000000000000 -r 5e33fa5a2fdc .project
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.project	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,17 @@
+
+
+ocrHandling
+
+
+
+
+
+org.python.pydev.PyDevBuilder
+
+
+
+
+
+org.python.pydev.pythonNature
+
+
diff -r 000000000000 -r 5e33fa5a2fdc .pydevproject
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.pydevproject	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,8 @@
+
+
+
+/${PROJECT_DIR_NAME}
+
+python 3.0
+python
+
diff -r 000000000000 -r 5e33fa5a2fdc .settings/org.eclipse.core.resources.prefs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.settings/org.eclipse.core.resources.prefs	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,3 @@
+eclipse.preferences.version=1
+encoding//data/OrdnerMusterJPG/pages/canon_01.txt=ISO-8859-1
+encoding/=UTF-8
diff -r 000000000000 -r 5e33fa5a2fdc copyFiles.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/copyFiles.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,47 @@
+"""Copy every PDF found below ``start`` into the flat folder ``target``.
+
+Each copy is named ``<parent folder>___<file name>`` with blanks in the
+file name replaced by underscores.  Files that already exist in ``target``
+are skipped; paths that could not be copied are logged to /tmp/error.out.
+"""
+from _csv import Error
+start ="/Volumes/Folivora/MPG_Archiv/"
+target = "/Volumes/MPGARCHIV/input/"
+
+
+import os
+import shutil
+
+# "with" closes the error log even if the walk aborts with an exception.
+with open("/tmp/error.out","w",encoding="utf-8") as errorf:
+    # Terminate the marker line so the first logged path is not glued to it.
+    errorf.write("START\n")
+
+    for root, dirs, files in os.walk(start, topdown=False):
+        for name in files:
+            path = os.path.join(root, name)
+            ext = os.path.splitext(path)
+            print (name)
+            if ext[1] != ".pdf":
+                continue
+
+            fld = os.path.split(root)
+            print (fld)
+            neu = os.path.join(target,fld[1]+"___"+name.replace(" ","_"))
+            if os.path.exists(neu):
+                print("EXISTS:" + path)
+                continue
+
+            try:
+                shutil.copy(path, neu)
+            except OSError:
+                # Best effort: log the failing path and go on with the rest.
+                errorf.write(path+"\n")
+            print (path)
+
+
+
+    #for name in dirs:
+    #    print(os.path.join(root, name))
+
+
diff -r 000000000000 -r 5e33fa5a2fdc copyFiles2.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/copyFiles2.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,50 @@
+"""Copy every PDF found below ``start`` into its own folder below ``target``.
+
+For ``<parent>/<name>.pdf`` the folder ``<target>/<parent>___<name>`` is
+created (blanks in the file name become underscores) and the PDF is copied
+to its ``full`` subfolder.  PDFs whose folder already exists are skipped;
+paths that could not be copied are logged to /tmp/error.out.
+"""
+from _csv import Error
+start ="/Volumes/Folivora/MPG_Archiv/"
+target = "/Volumes/MPGARCHIV/struct2/"
+
+
+import os
+import os.path
+import shutil
+
+# "with" closes the error log even if the walk aborts with an exception.
+with open("/tmp/error.out","w",encoding="utf-8") as errorf:
+    # Terminate the marker line so the first logged path is not glued to it.
+    errorf.write("START\n")
+
+    for root, dirs, files in os.walk(start, topdown=False):
+        for name in files:
+            path = os.path.join(root, name)
+            ext = os.path.splitext(path)
+            print (name)
+            if ext[1] != ".pdf":
+                continue
+
+            fld = os.path.split(root)
+            print (fld)
+            neu = os.path.join(target,fld[1]+"___"+name.replace(" ","_"))
+
+            # The per-document folder is the target name without ".pdf".
+            pathNeu = os.path.splitext(neu)[0]
+
+            if os.path.exists(pathNeu):
+                print("EXISTS:" + path)
+                continue
+
+            try:
+                # Creating the folders can fail just like the copy, so it is
+                # guarded too; makedirs creates pathNeu on the way to "full".
+                os.makedirs(pathNeu+"/full")
+                shutil.copy(path, pathNeu+"/full/"+name.replace(" ","_"))
+            except OSError:
+                # Best effort: log the failing path and go on with the rest.
+                errorf.write(path+"\n")
+
+            print (path)
diff -r 000000000000 -r 5e33fa5a2fdc createIndexMetas.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/createIndexMetas.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,68 @@
+"""Create initial index.meta files for the document folders of an archive."""
+import os
+import os.path
+
+from lxml import etree
+
+class indexMetaHandler:
+    """Writes an index.meta, derived from index.meta.stub.xml, per folder."""
+
+    def createInitialIndexMetas(self,path,start=None,end=None):
+        """Create an index.meta in every folder directly below ``path``.
+
+        ``start`` and ``end`` optionally restrict the run to the entries
+        whose position in the sorted listing lies in ``start..end``
+        (zero-based, both inclusive).
+        """
+        # os.listdir returns entries in arbitrary order; sorting makes the
+        # start/end window select the same entries on every run.
+        for cnt, fn in enumerate(sorted(os.listdir(path))):
+            if start is not None and cnt<start:
+                continue
+            if end is not None and cnt>end:
+                return
+            if not os.path.isdir(os.path.join(path,fn)):
+                # Stray files (e.g. .DS_Store) cannot hold an index.meta.
+                continue
+            print("PROCESS: %s"%fn)
+
+            self.createInitialIndexMeta(path,fn)
+
+    def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPIWG"):
+        """Fill the stub for folder ``fn`` below ``path`` and write index.meta.
+
+        Every occurrence of ``remove`` is deleted from the stored
+        archive-path.  The stub is read from index.meta.stub.xml in the
+        current directory.
+        """
+        # os.path.join works whether or not ``path`` ends with a slash.
+        folder = os.path.join(path,fn)
+
+        dom=etree.parse("index.meta.stub.xml") #read the stub
+
+        dom.xpath("/resource/name")[0].text=fn
+        dom.xpath("/resource/archive-path")[0].text=folder.replace(remove,"")
+
+        #check images
+        if os.path.exists(os.path.join(folder,"pages")):
+            dom.xpath("/resource/meta/texttool/image")[0].text="pages"
+
+        # Register each available text source as a <text type="..."> child.
+        texttool = dom.xpath("/resource/meta/texttool")[0]
+        for sub, textType in (("full","pdf"),("hocr","hocr")):
+            if os.path.exists(os.path.join(folder,sub)):
+                textElement = etree.Element("text",type=textType)
+                textElement.text=sub
+                texttool.append(textElement)
+
+        dom.xpath("/resource/meta/bib/title")[0].text=fn
+
+        txt=etree.tostring(dom, pretty_print=True)
+
+        # "with" closes the file even if the write fails.
+        with open(os.path.join(folder,"index.meta"),"bw") as out:
+            out.write(txt)
+
+if __name__ == "__main__":
+    ih = indexMetaHandler()
+    ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")
diff -r 000000000000 -r 5e33fa5a2fdc doOCR.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doOCR.py	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,88 @@
+"""Split the PDF of every document folder into page images and OCR them."""
+import shlex
+import subprocess
+import os.path
+import os
+import sys
+
+class ocrManager:
+    """Runs the external convert and tesseract tools on one document folder."""
+
+    def split(self,foldername):
+        """Render <foldername>/full/<name>.pdf to PNG pages in <foldername>/pages.
+
+        ``foldername`` must end in ``<parent>___<name>``; convert is called
+        with -density 300 and the output pattern img-%03d.png.
+        """
+        # Cut at the first "___" only: <name> itself contains "___" when the
+        # original file name had a run of blanks (assumes <parent> has none).
+        filename=foldername.split("/")[-1].split("___",1)[1]
+
+        outfolder = foldername + "/pages/"
+        os.makedirs(outfolder, exist_ok=True)
+
+        # An argument list (instead of a shlex-split command string) keeps
+        # paths containing blanks or quotes in one piece.
+        listArguments = ["/usr/local/bin/convert", "-verbose", "-density", "300",
+                         foldername + "/full/" + filename + ".pdf",
+                         outfolder + "img-%03d.png"]
+        returncode = subprocess.call(listArguments)
+        if returncode != 0:
+            print("CONVERT FAILED (%s): %s"%(returncode,foldername))
+
+    def ocr(self,foldername):
+        """Run tesseract (-l deu, hocr) on every file in <foldername>/pages.
+
+        The output base name is <foldername>/hocr/<page file name>; what
+        tesseract writes to stdout is echoed.
+        """
+        #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
+        pagesfolder = foldername + "/pages/"
+        hocrfolder = foldername + "/hocr/"
+        os.makedirs(hocrfolder, exist_ok=True)
+
+        for fn in os.listdir(pagesfolder):
+            listArguments = ["/usr/local/bin/tesseract", "-l", "deu",
+                             pagesfolder + fn, hocrfolder + fn, "hocr"]
+            # The context manager waits for tesseract and closes the pipe.
+            with subprocess.Popen(listArguments,stdout=subprocess.PIPE) as exeShell:
+                for line in exeShell.stdout:
+                    print (line.rstrip())
+
+
+def main(args):
+    """Process the folders below the archive root.
+
+    ``args[1]`` and ``args[2]`` optionally give the zero-based, inclusive
+    start and end position within the sorted folder listing.
+    """
+    om = ocrManager()
+
+    start=None
+    end=None
+    if len(args) > 1: #start folder number
+        start = int(args[1])
+
+    if len(args) > 2: #end folder number
+        end = int(args[2])
+
+    foldername = "/Volumes/MPGARCHIV/struct2/"
+
+    # os.listdir returns entries in arbitrary order; sorting makes the
+    # start/end window select the same folders on every run.
+    for cnt, fn in enumerate(sorted(os.listdir(foldername))):
+        if start is not None and cnt<start:
+            continue
+        if end is not None and cnt>end:
+            break
+        if not os.path.isdir(foldername+fn):
+            # Stray files (e.g. .DS_Store) are not document folders.
+            continue
+        print("PROCESS: %s"%fn)
+
+        om.split(foldername+fn)
+        om.ocr(foldername+fn)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff -r 000000000000 -r 5e33fa5a2fdc index.meta.stub.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/index.meta.stub.xml	Tue Aug 19 14:12:52 2014 +0200
@@ -0,0 +1,22 @@
+
+
+
+
+mpiwg
+
+
+
+scanned document
+
+
+
+
+
+
+
+
+
+
+
+
+