# HG changeset patch # User dwinter # Date 1408451076 -7200 # Node ID 5b7ed10ecbb426f02b3c78769c3acc2258f56d23 # Parent 5e33fa5a2fdc01b3f27f9567bf2caeff249b948e initial2 diff -r 5e33fa5a2fdc -r 5b7ed10ecbb4 createIndexMetas.py --- a/createIndexMetas.py Tue Aug 19 14:12:52 2014 +0200 +++ b/createIndexMetas.py Tue Aug 19 14:24:36 2014 +0200 @@ -21,7 +21,7 @@ self.createInitialIndexMeta(path,fn) - def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPIWG"): + def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPGARCHIV"): dom=etree.parse("index.meta.stub.xml") #read the stub @@ -74,4 +74,4 @@ ih = indexMetaHandler() -ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/") \ No newline at end of file +ih.createInitialIndexMetas("/Volumes/MPGARCHIV/struct2/") \ No newline at end of file diff -r 5e33fa5a2fdc -r 5b7ed10ecbb4 doOCR.py --- a/doOCR.py Tue Aug 19 14:12:52 2014 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -import shlex -import subprocess -import os.path -import os - -class ocrManager: - - #strCommand = "/opt/local/bin/gm convert " + strImagepath + " -resize 1500x\\> " + strImagepath - - def split(self,foldername): - """splits pdf file in foldername, default it assumes a folder full containing a file foldername.pdf""" - - filename=foldername.split("/")[-1].split("___")[1] - - - - outfolder = foldername + "/pages/" - - if not os.path.exists(outfolder): - os.mkdir(outfolder) - - outName = outfolder + "img" + "-\%03d.png" - #outName = outfolder + filename + "-%03d.png" - strCommand = "/usr/local/bin/convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName - #strCommand = "/usr/local/bin/gm convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName - - listArguments = shlex.split(strCommand) - #listArguments = ["/usr/local/bin/gm","convert", "-verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName] - exeShell = subprocess.call(listArguments) - - - - while False: - line = exeShell.stdout.readline() - if line != b'': - #the real code does filtering here - print (line.rstrip()) - else: - break - - def ocr(self,foldername): - filename=foldername.split("/")[-1] - - #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr - - - pagesfolder = foldername + "/pages/" - hocrfolder = foldername + "/hocr/" - - if not os.path.exists(hocrfolder): - os.mkdir(hocrfolder) - - - - for fn in os.listdir(pagesfolder): - - outName = hocrfolder + fn - - - strCommand = "/usr/local/bin/tesseract -l deu " + pagesfolder + fn +" " + hocrfolder + fn + " hocr" - - listArguments = shlex.split(strCommand) - exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE) - - while True: - line = exeShell.stdout.readline() - if line != b'': - #the real code does filtering here - print (line.rstrip()) - else: - break - - - -import sys -om = ocrManager() - -args = sys.argv - -start=None -end=None -if len(args) > 1: #start foldernummer - start = int(args[1]) - -if len(args) > 2: #start foldernummer - end = int(args[2]) - - -foldername = "/Volumes/MPGARCHIV/struct2/" - -cnt=-1 -for fn in os.listdir(foldername): - cnt+=1 - - if start is not None: - if cntend: - sys.exit() - print("PROCESS: %s"%fn) - - - - om.split(foldername+fn) - om.ocr(foldername+fn) diff -r 5e33fa5a2fdc -r 5b7ed10ecbb4 doOCR2.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doOCR2.py Tue Aug 19 14:24:36 2014 +0200 @@ -0,0 +1,105 @@ +import shlex +import subprocess +import os.path +import os + +class ocrManager: + + #strCommand = "/opt/local/bin/gm convert " + strImagepath + " -resize 1500x\\> " + strImagepath + + def split(self,foldername,onExistExit=False): + """splits pdf file in foldername, default it assumes a folder full containing a file foldername.pdf""" + + filename=foldername.split("/")[-1].split("___")[1] + + + + outfolder = foldername + "/pages/" + + if not os.path.exists(outfolder): + os.mkdir(outfolder) + else: + if onExistExit: #if set don't do anything + print (" ---- exists (split)") + return + + outName = outfolder + "img" + "-\%03d.png" + #outName = outfolder + filename + "-%03d.png" + strCommand = "/usr/local/bin/convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName + #strCommand = "/usr/local/bin/gm convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName + + listArguments = shlex.split(strCommand) + #listArguments = ["/usr/local/bin/gm","convert", "-verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName] + exeShell = subprocess.call(listArguments) + + + + while False: + line = exeShell.stdout.readline() + if line != b'': + #the real code does filtering here + print (line.rstrip()) + else: + break + + def ocr(self,foldername,onExistExit=False): + filename=foldername.split("/")[-1] + + #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr + + + pagesfolder = foldername + "/pages/" + hocrfolder = foldername + "/hocr/" + + if not os.path.exists(hocrfolder): + os.mkdir(hocrfolder) + else: + if onExistExit: #if set don't do anything + print (" ---- exists (ocrj)") + return + + + + + for fn in os.listdir(pagesfolder): + + outName = hocrfolder + fn + + + strCommand = "/usr/local/bin/tesseract -l deu " + pagesfolder + fn +" " + hocrfolder + fn + " hocr" + + listArguments = shlex.split(strCommand) + exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE) + + while True: + line = exeShell.stdout.readline() + if line != b'': + #the real code does filtering here + print (line.rstrip()) + else: + break + + + +import sys +om = ocrManager() + +args = sys.argv + +start=None +end=None +if len(args) > 1: #start foldernummer + start = int(args[1]) + +if len(args) > 2: #start foldernummer + end = int(args[2]) + + +foldername = "/Volumes/MPGARCHIV/struct2/" + +cnt=-1 + + + + +om.ocr("/Volumes/MPGARCHIV/israel",onExistExit=True) diff -r 5e33fa5a2fdc -r 5b7ed10ecbb4 doOCR_old.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doOCR_old.py Tue Aug 19 14:24:36 2014 +0200 @@ -0,0 +1,51 @@ +from _csv import Error +#start ="/Volumes/Folivora/MPG_Archiv/" +start ="/Volumes/Folivora/MPG_Archiv/Publikationen der MPG" +target = "/Volumes/MPGARCHIV/struct2/" + +pathAdd = True #add the path of the file to the filename + +import os +import os.path +import shutil + +errorf = open("/tmp/error.out","w",encoding="utf-8") +errorf.write("START") + +for root, dirs, files in os.walk(start, topdown=False): + for name in files: + path = os.path.join(root, name) + ext = os.path.splitext(path) + print (name) + if ext[1] == ".pdf": + if pathAdd: + fld = os.path.split(root) + print (fld) + neu = os.path.join(target,fld[1].replace(" ","_")+"___"+name.replace(" ","_")) + else: + neu = os.path.join(target,name.replace(" ","_")) + pathNeu,ext=os.path.splitext(neu) + + if not os.path.exists(pathNeu): + + os.makedirs(pathNeu) + os.makedirs(pathNeu+"/full") + + try: + neu = pathNeu+"/full/"+name.replace(" ","_") + shutil.copy(path, neu) + except: + errorf.write(path+"\n") + + print (path) + else: + print("EXISTS:" + path) + +errorf.close(); + + + + #for name in dirs: + # print(os.path.join(root, name)) + + \ No newline at end of file diff -r 5e33fa5a2fdc -r 5b7ed10ecbb4 doOCR_tmp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doOCR_tmp.py Tue Aug 19 14:24:36 2014 +0200 @@ -0,0 +1,119 @@ +import shlex +import subprocess +import os.path +import os + +class ocrManager: + + #strCommand = "/opt/local/bin/gm convert " + strImagepath + " -resize 1500x\\> " + strImagepath + + def split(self,foldername,onExistExit=False): + """splits pdf file in foldername, default it assumes a folder full containing a file foldername.pdf""" + + filename=foldername.split("/")[-1].split("___")[1] + + + + outfolder = foldername + "/pages/" + + if not os.path.exists(outfolder): + os.mkdir(outfolder) + else: + if onExistExit: #if set don't do anything + print (" ---- exists (split)") + return + + outName = outfolder + "img" + "-\%03d.png" + #outName = outfolder + filename + "-%03d.png" + strCommand = "/usr/local/bin/convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName + #strCommand = "/usr/local/bin/gm convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName + + listArguments = shlex.split(strCommand) + #listArguments = ["/usr/local/bin/gm","convert", "-verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName] + exeShell = subprocess.call(listArguments) + + + + while False: + line = exeShell.stdout.readline() + if line != b'': + #the real code does filtering here + print (line.rstrip()) + else: + break + + def ocr(self,foldername,onExistExit=False): + filename=foldername.split("/")[-1] + + #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr + + + pagesfolder = foldername + "/pages/" + hocrfolder = foldername + "/hocr/" + + if not os.path.exists(hocrfolder): + os.mkdir(hocrfolder) + else: + if onExistExit: #if set don't do anything + print (" ---- exists (ocrj)") + return + + + + + for fn in os.listdir(pagesfolder): + + outName = hocrfolder + fn + + + strCommand = "/usr/local/bin/tesseract -l deu " + pagesfolder + fn +" " + hocrfolder + fn + " hocr" + + listArguments = shlex.split(strCommand) + exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE) + + while True: + line = exeShell.stdout.readline() + if line != b'': + #the real code does filtering here + print (line.rstrip()) + else: + break + + + +import sys +om = ocrManager() + +args = sys.argv + +start=None +end=None +if len(args) > 1: #start foldernummer + start = int(args[1]) + +if len(args) > 2: #start foldernummer + end = int(args[2]) + + +foldername = "/Volumes/MPGARCHIV/struct2/" + +cnt=-1 +for fn in os.listdir(foldername): + cnt+=1 + if fn.find("1953-1974")<0: + print ("DON't do:"+fn) + continue + if start is not None: + if cntend: + sys.exit() + print("PROCESS: %s"%fn) + + + + om.split(foldername+fn,onExistExit=False) + om.ocr(foldername+fn,onExistExit=False)