Mercurial > hg > ocrHandling
changeset 0:5e33fa5a2fdc
initial
author | Dirk Wintergruen <dwinter@mpiwg-berlin.mpg.de> |
---|---|
date | Tue, 19 Aug 2014 14:12:52 +0200 |
parents | |
children | 5b7ed10ecbb4 |
files | .hgignore .project .pydevproject .settings/org.eclipse.core.resources.prefs copyFiles.py copyFiles2.py createIndexMetas.py doOCR.py index.meta.stub.xml |
diffstat | 9 files changed, 324 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Tue Aug 19 14:12:52 2014 +0200 @@ -0,0 +1,3 @@ + +syntax: regexp +^data$ \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.project Tue Aug 19 14:12:52 2014 +0200 @@ -0,0 +1,17 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>ocrHandling</name> + <comment></comment> + <projects> + </projects> + <buildSpec> + <buildCommand> + <name>org.python.pydev.PyDevBuilder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.python.pydev.pythonNature</nature> + </natures> +</projectDescription>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.pydevproject Tue Aug 19 14:12:52 2014 +0200 @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<?eclipse-pydev version="1.0"?><pydev_project> +<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH"> +<path>/${PROJECT_DIR_NAME}</path> +</pydev_pathproperty> +<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 3.0</pydev_property> +<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">python</pydev_property> +</pydev_project>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.settings/org.eclipse.core.resources.prefs Tue Aug 19 14:12:52 2014 +0200 @@ -0,0 +1,3 @@ +eclipse.preferences.version=1 +encoding//data/OrdnerMusterJPG/pages/canon_01.txt=ISO-8859-1 +encoding/<project>=UTF-8
# File: copyFiles.py
"""Copy every ``.pdf`` found below ``start`` into the flat folder ``target``.

Each copy is named ``<source folder name>___<file name>``, with blanks in the
file name replaced by underscores.  Files that already exist in ``target``
are left alone; files that cannot be copied are listed in the error log.
"""
from _csv import Error  # NOTE(review): never used in this script

import os
import shutil

start = "/Volumes/Folivora/MPG_Archiv/"
target = "/Volumes/MPGARCHIV/input/"
error_log = "/tmp/error.out"


def copy_pdfs(start, target, error_log):
    """Copy all ``.pdf`` files below *start* into *target*.

    The tree is walked bottom-up.  Only the exact, lower-case extension
    ``.pdf`` is considered.

    Args:
        start: root folder that is searched recursively.
        target: existing folder that receives the copies.
        error_log: file that is (re)written with the marker line ``START``
            followed by one line per source path that could not be copied.

    Returns:
        The number of files copied by this call.
    """
    copied = 0
    # The context manager closes the log even if the walk itself blows up.
    with open(error_log, "w", encoding="utf-8") as errorf:
        # Newline-terminated so the first failing path gets its own line.
        errorf.write("START\n")
        for root, dirs, files in os.walk(start, topdown=False):
            folder = os.path.split(root)[1]
            for name in files:
                print(name)
                if os.path.splitext(name)[1] != ".pdf":
                    continue

                path = os.path.join(root, name)
                neu = os.path.join(
                    target, folder + "___" + name.replace(" ", "_"))
                if os.path.exists(neu):
                    print("EXISTS:" + path)
                    continue

                try:
                    shutil.copy(path, neu)
                except OSError as err:
                    # Best effort: record the failure and carry on.
                    errorf.write(path + "\n")
                    print("FAILED: %s (%s)" % (path, err))
                else:
                    copied += 1
                    print(path)
    return copied


if __name__ == "__main__":
    copy_pdfs(start, target, error_log)
# File: copyFiles2.py
"""Copy every ``.pdf`` found below ``start`` into one folder per document.

For a source file ``<folder>/<name>.pdf`` the copy ends up at
``<target>/<folder>___<name>/full/<name>.pdf`` (blanks in the file name are
replaced by underscores).  Files that cannot be copied are listed in the
error log.
"""
from _csv import Error  # NOTE(review): never used in this script

import os
import os.path
import shutil

start = "/Volumes/Folivora/MPG_Archiv/"
target = "/Volumes/MPGARCHIV/struct2/"
error_log = "/tmp/error.out"


def copy_pdfs_structured(start, target, error_log):
    """Copy all ``.pdf`` files below *start* into per-document folders.

    The tree is walked bottom-up.  Only the exact, lower-case extension
    ``.pdf`` is considered.

    Args:
        start: root folder that is searched recursively.
        target: folder below which the document folders are created.
        error_log: file that is (re)written with the marker line ``START``
            followed by one line per source path that could not be copied.

    Returns:
        The number of files copied by this call.
    """
    copied = 0
    # The context manager closes the log even if the walk itself blows up.
    with open(error_log, "w", encoding="utf-8") as errorf:
        # Newline-terminated so the first failing path gets its own line.
        errorf.write("START\n")
        for root, dirs, files in os.walk(start, topdown=False):
            folder = os.path.split(root)[1]
            for name in files:
                print(name)
                if os.path.splitext(name)[1] != ".pdf":
                    continue

                path = os.path.join(root, name)
                clean_name = name.replace(" ", "_")
                # Document folder: the flat target name minus its extension.
                path_neu = os.path.splitext(
                    os.path.join(target, folder + "___" + clean_name))[0]
                full_dir = os.path.join(path_neu, "full")
                neu = os.path.join(full_dir, clean_name)

                # Test for the copied file, not just the document folder, so
                # a folder left empty by a failed copy is retried next run.
                if os.path.exists(neu):
                    print("EXISTS:" + path)
                    continue

                try:
                    os.makedirs(full_dir, exist_ok=True)
                    shutil.copy(path, neu)
                except OSError as err:
                    # Best effort: record the failure and carry on.
                    errorf.write(path + "\n")
                    print("FAILED: %s (%s)" % (path, err))
                else:
                    copied += 1
                    print(path)
    return copied


if __name__ == "__main__":
    copy_pdfs_structured(start, target, error_log)
# File: createIndexMetas.py
"""Create an initial ``index.meta`` file in each document folder."""
import os
import os.path

from lxml import etree


class indexMetaHandler:
    """Fills the ``index.meta`` stub with per-folder values and writes it."""

    def createInitialIndexMetas(self, path, start=None, end=None):
        """Write ``index.meta`` for the entries of *path*.

        Entries are numbered from 0 in sorted name order.  ``os.listdir``
        returns names in arbitrary order, so without sorting a given
        *start*/*end* range would not select the same folders on every run.

        Args:
            path: folder whose sub-folders are processed.
            start: index of the first entry to process; earlier ones are
                skipped.  ``None`` means "from the beginning".
            end: index of the last entry to process (inclusive).  ``None``
                means "to the end".
        """
        for cnt, fn in enumerate(sorted(os.listdir(path))):
            if start is not None and cnt < start:
                print("SKIP: %s" % fn)
                continue

            if end is not None and cnt > end:
                return

            # Plain files cannot hold an index.meta; they still keep their
            # index so the numbering of the folders does not shift.
            if not os.path.isdir(os.path.join(path, fn)):
                print("SKIP: %s" % fn)
                continue

            print("PROCESS: %s" % fn)
            self.createInitialIndexMeta(path, fn)

    def createInitialIndexMeta(self, path, fn, remove="/Volumes/MPIWG",
                               stub="index.meta.stub.xml"):
        """Write ``<path>/<fn>/index.meta`` based on the stub document.

        Sets ``name`` and ``bib/title`` to *fn* and ``archive-path`` to the
        folder path with every occurrence of *remove* deleted.  If the
        folder has a ``pages`` sub-folder, ``texttool/image`` is set to
        ``pages``.  For each of the sub-folders ``full`` and ``hocr`` that
        exists, a ``<text type="pdf">full</text>`` or
        ``<text type="hocr">hocr</text>`` element is appended to
        ``texttool``.

        Args:
            path: parent folder of the document folder.
            fn: name of the document folder.
            remove: substring to delete from the archive path.
            stub: XML stub to start from; a relative name is resolved
                against the current working directory.
        """
        folder = os.path.join(path, fn)

        dom = etree.parse(stub)  # read the stub

        dom.xpath("/resource/name")[0].text = fn
        dom.xpath("/resource/archive-path")[0].text = folder.replace(remove, "")

        # check images
        if os.path.exists(os.path.join(folder, "pages")):
            dom.xpath("/resource/meta/texttool/image")[0].text = "pages"

        texttool = dom.xpath("/resource/meta/texttool")[0]
        for subfolder, text_type in (("full", "pdf"), ("hocr", "hocr")):
            if os.path.exists(os.path.join(folder, subfolder)):
                text_element = etree.Element("text", type=text_type)
                text_element.text = subfolder
                texttool.append(text_element)

        dom.xpath("/resource/meta/bib/title")[0].text = fn

        txt = etree.tostring(dom, pretty_print=True)

        # tostring() returns bytes, hence binary mode.
        with open(os.path.join(folder, "index.meta"), "wb") as out:
            out.write(txt)


if __name__ == "__main__":
    ih = indexMetaHandler()
    ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")
# File: doOCR.py
"""Split the PDF of a document folder into page images and OCR them."""
import shlex
import subprocess
import os.path
import os


class ocrManager:
    """Runs the external ``convert`` and ``tesseract`` programs on a folder.

    The class attributes below are the defaults used by ``split`` and
    ``ocr``; override them on an instance to use other tools or settings.
    """

    convert_cmd = "/usr/local/bin/convert"
    tesseract_cmd = "/usr/local/bin/tesseract"
    density = 300      # value passed to convert's -density option
    language = "deu"   # value passed to tesseract's -l option

    def split(self, foldername):
        """Split the PDF of *foldername* into one PNG per page.

        The folder name must look like ``<prefix>___<name>``; the PDF is
        expected at ``<foldername>/full/<name>.pdf`` and the images are
        written to ``<foldername>/pages/img-NNN.png``.

        Args:
            foldername: path of the document folder.

        Returns:
            The exit status of the ``convert`` call.

        Raises:
            ValueError: if the folder name contains no ``___`` separator.
        """
        name = os.path.basename(foldername.rstrip("/"))
        # Cut at the first separator only, so a file name that itself
        # contains "___" is kept whole.
        prefix, separator, filename = name.partition("___")
        if not separator:
            raise ValueError(
                "folder name without '___' separator: %r" % foldername)

        outfolder = os.path.join(foldername, "pages")
        os.makedirs(outfolder, exist_ok=True)

        # Pass the arguments as a list: building one command string and
        # splitting it again breaks on paths containing blanks or quotes.
        arguments = [
            self.convert_cmd,
            "-verbose",
            "-density", str(self.density),
            os.path.join(foldername, "full", filename + ".pdf"),
            os.path.join(outfolder, "img-%03d.png"),
        ]
        return subprocess.call(arguments)

    def ocr(self, foldername):
        """Run tesseract with hOCR output on every file in ``pages``.

        For each file ``<foldername>/pages/<fn>`` tesseract is called with
        the output base ``<foldername>/hocr/<fn>`` and the ``hocr`` config.
        Everything tesseract writes to stdout is printed.

        Args:
            foldername: path of the document folder.

        Returns:
            The names of the page files for which tesseract returned a
            non-zero exit status, in processing order.
        """
        pagesfolder = os.path.join(foldername, "pages")
        hocrfolder = os.path.join(foldername, "hocr")
        os.makedirs(hocrfolder, exist_ok=True)

        failed = []
        for fn in sorted(os.listdir(pagesfolder)):
            arguments = [
                self.tesseract_cmd,
                "-l", self.language,
                os.path.join(pagesfolder, fn),
                os.path.join(hocrfolder, fn),
                "hocr",
            ]
            # The context manager closes the pipe and waits for the child,
            # so finished processes do not pile up during long runs.
            with subprocess.Popen(arguments, stdout=subprocess.PIPE) as process:
                for line in process.stdout:
                    print(line.rstrip())
            if process.returncode != 0:
                failed.append(fn)
        return failed
import sys


def main(argv):
    """Split and OCR the document folders below the archive root.

    Entries are numbered from 0 in sorted name order.  ``os.listdir``
    returns names in arbitrary order, so without sorting a given start/end
    range would not select the same folders on every run.

    Args:
        argv: command line; ``argv[1]`` (optional) is the index of the
            first folder to process, ``argv[2]`` (optional) the index of
            the last one (inclusive).
    """
    start = int(argv[1]) if len(argv) > 1 else None  # first folder number
    end = int(argv[2]) if len(argv) > 2 else None    # last folder number

    foldername = "/Volumes/MPGARCHIV/struct2/"
    om = ocrManager()

    for cnt, fn in enumerate(sorted(os.listdir(foldername))):
        if start is not None and cnt < start:
            print("SKIP: %s" % fn)
            continue

        if end is not None and cnt > end:
            break

        folder = os.path.join(foldername, fn)
        # split() derives the PDF name from the part after "___"; plain
        # files and folders without that separator cannot be processed.
        if "___" not in fn or not os.path.isdir(folder):
            print("SKIP: %s" % fn)
            continue

        print("PROCESS: %s" % fn)
        om.split(folder)
        om.ocr(folder)


if __name__ == "__main__":
    main(sys.argv)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/index.meta.stub.xml Tue Aug 19 14:12:52 2014 +0200 @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8"?><resource version="1.1" type="MPIWG"> +<name></name> +<archive-path></archive-path> +<archive-creation-date></archive-creation-date> +<creator>mpiwg</creator> +<description></description> +<media-type></media-type> +<meta> +<content-type>scanned document</content-type> +<access-conditions> +<access type="mpiwg"/> +</access-conditions> +<texttool> +<image></image> +</texttool> +<bib type="manuscript"> +<author></author> +<title></title> +</bib> +<dri type=""></dri> +</meta> +</resource>