Mercurial > hg > ocrHandling

--- a/createIndexMetas.py	Tue Aug 19 14:12:52 2014 +0200
+++ b/createIndexMetas.py	Tue Aug 19 14:24:36 2014 +0200
@@ -21,7 +21,7 @@

             self.createInitialIndexMeta(path,fn)

-    def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPIWG"):
+    def createInitialIndexMeta(self,path,fn,remove="/Volumes/MPGARCHIV"):

         dom=etree.parse("index.meta.stub.xml") #read the stub

@@ -74,4 +74,4 @@

 ih = indexMetaHandler()

-ih.createInitialIndexMetas("/Volumes/MPGARCHIV2/struct2/")
\ No newline at end of file
+ih.createInitialIndexMetas("/Volumes/MPGARCHIV/struct2/")
\ No newline at end of file
--- a/doOCR.py	Tue Aug 19 14:12:52 2014 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,108 +0,0 @@
-import shlex
-import subprocess
-import os.path
-import os
-
-class ocrManager:
-
-    #strCommand = "/opt/local/bin/gm convert " + strImagepath + " -resize 1500x\\> " + strImagepath
-
-    def split(self,foldername):
-        """splits pdf file in foldername, default it assumes a folder full containing a file foldername.pdf"""
-
-        filename=foldername.split("/")[-1].split("___")[1]
-
-
-
-        outfolder = foldername + "/pages/"
-
-        if not os.path.exists(outfolder):
-            os.mkdir(outfolder)
-
-        outName = outfolder +  "img" + "-\%03d.png"
-        #outName = outfolder +  filename + "-%03d.png"
-        strCommand = "/usr/local/bin/convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName
-        #strCommand = "/usr/local/bin/gm convert -verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName
-
-        listArguments = shlex.split(strCommand)
-        #listArguments = ["/usr/local/bin/gm","convert", "-verbose -density 300 " + foldername + "/full/" + filename +".pdf " + outName]
-        exeShell = subprocess.call(listArguments)
-
-
-
-        while False:
-            line = exeShell.stdout.readline()
-            if line != b'':
-                #the real code does filtering here
-                print (line.rstrip())
-            else:
-                break
-
-    def ocr(self,foldername):
-        filename=foldername.split("/")[-1]
-
-        #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
-
-
-        pagesfolder = foldername + "/pages/"
-        hocrfolder = foldername + "/hocr/"
-
-        if not os.path.exists(hocrfolder):
-            os.mkdir(hocrfolder)
-
-
-
-        for fn in os.listdir(pagesfolder):
-
-            outName = hocrfolder +  fn
-
-
-            strCommand = "/usr/local/bin/tesseract -l deu  " + pagesfolder + fn +"  " + hocrfolder + fn + " hocr"
-
-            listArguments = shlex.split(strCommand)
-            exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE)
-
-            while True:
-                line = exeShell.stdout.readline()
-                if line != b'':
-                    #the real code does filtering here
-                    print (line.rstrip())
-                else:
-                    break
-
-
-
-import sys
-om = ocrManager()
-
-args = sys.argv
-
-start=None
-end=None
-if len(args) > 1: #start foldernummer
-    start = int(args[1])
-
-if len(args) > 2: #start foldernummer
-    end = int(args[2])
-
-
-foldername = "/Volumes/MPGARCHIV/struct2/"
-
-cnt=-1
-for fn in os.listdir(foldername):
-    cnt+=1
-
-    if start is not None:
-        if cnt<start:
-            print("SKIP: %s"%fn)
-            continue
-
-    if end is not None:
-        if cnt>end:
-            sys.exit()
-    print("PROCESS: %s"%fn)
-
-
-
-    om.split(foldername+fn)
-    om.ocr(foldername+fn)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doOCR2.py	Tue Aug 19 14:24:36 2014 +0200
@@ -0,0 +1,105 @@
+import shlex
+import subprocess
+import os.path
+import os
+
+class ocrManager:
+    """Splits scanned PDFs into page images and runs tesseract on them."""
+
+    def split(self,foldername,onExistExit=False):
+        """Render the pages of the folder's PDF into <foldername>/pages/.
+
+        The last path component of foldername must have the form
+        "<prefix>___<name>"; the PDF is read from
+        <foldername>/full/<name>.pdf and ImageMagick's convert writes the
+        pages using the output pattern pages/img-%03d.png.
+
+        foldername  -- folder of one document, without trailing slash
+        onExistExit -- if True, do nothing when pages/ already exists
+
+        Returns the exit status of convert, or None if the split was
+        skipped. Raises IndexError if the folder name contains no "___".
+        """
+        filename=foldername.split("/")[-1].split("___")[1]
+
+        outfolder = foldername + "/pages/"
+
+        if not os.path.exists(outfolder):
+            os.mkdir(outfolder)
+        elif onExistExit: #if set don't do anything
+            print (" ---- exists (split)")
+            return
+
+        # Hand the arguments over as a list instead of shlex-splitting one
+        # concatenated string, so paths with spaces or quotes stay intact.
+        # (The old "-\%03d.png" only became "-%03d.png" via shlex unescaping.)
+        listArguments = ["/usr/local/bin/convert", "-verbose", "-density", "300",
+                         foldername + "/full/" + filename + ".pdf",
+                         outfolder + "img-%03d.png"]
+
+        # subprocess.call blocks until convert has finished
+        return subprocess.call(listArguments)
+
+    def ocr(self,foldername,onExistExit=False):
+        """Run tesseract (language deu, hocr output) on each file in pages/.
+
+        For every file <fn> in <foldername>/pages/ tesseract is started with
+        the output base <foldername>/hocr/<fn>; its stdout is echoed line
+        by line.
+
+        foldername  -- folder of one document
+        onExistExit -- if True, do nothing when hocr/ already exists
+
+        Raises OSError if <foldername>/pages/ does not exist.
+        """
+        #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
+
+        pagesfolder = foldername + "/pages/"
+        hocrfolder = foldername + "/hocr/"
+
+        if not os.path.exists(hocrfolder):
+            os.mkdir(hocrfolder)
+        elif onExistExit: #if set don't do anything
+            print (" ---- exists (ocrj)")
+            return
+
+        for fn in os.listdir(pagesfolder):
+            # argument list instead of shlex.split: safe for odd file names
+            listArguments = ["/usr/local/bin/tesseract", "-l", "deu",
+                             pagesfolder + fn, hocrfolder + fn, "hocr"]
+            exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE)
+
+            # iterating the pipe ends at EOF, i.e. when tesseract closes it
+            for line in exeShell.stdout:
+                #the real code does filtering here
+                print (line.rstrip())
+
+            # the original never closed the pipe or waited for the child;
+            # do both so every tesseract run is finished before the next
+            exeShell.stdout.close()
+            exeShell.wait()
+
+
+
+import sys
+
+# Driver: run the OCR step on one fixed folder; with onExistExit=True
+# the call does nothing if the folder already contains hocr/.
+om = ocrManager()
+
+args = sys.argv
+
+# Optional positional arguments, both parsed as integers:
+#   1st -> start (number of the first folder)
+#   2nd -> end (presumably the number of the last folder)
+# Neither value is read by the single call below.
+start = int(args[1]) if len(args) > 1 else None
+end = int(args[2]) if len(args) > 2 else None
+
+# Also not read anywhere in this script.
+foldername = "/Volumes/MPGARCHIV/struct2/"
+cnt = -1
+
+
+
+om.ocr("/Volumes/MPGARCHIV/israel",onExistExit=True)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doOCR_old.py	Tue Aug 19 14:24:36 2014 +0200
@@ -0,0 +1,51 @@
+from _csv import Error
+#start ="/Volumes/Folivora/MPG_Archiv/"
+start ="/Volumes/Folivora/MPG_Archiv/Publikationen der MPG"
+target = "/Volumes/MPGARCHIV/struct2/"
+
+pathAdd = True #add the path of the file to the filename
+
+import os
+import os.path
+import shutil
+
+# Copy every *.pdf below start into its own folder <target>/<stem>/full/.
+# Paths whose copy failed are logged one per line; "with" closes the log
+# even if the walk is aborted.
+with open("/tmp/error.out","w",encoding="utf-8") as errorf:
+    errorf.write("START\n") # newline keeps the first path on its own line
+
+    for root, dirs, files in os.walk(start, topdown=False):
+        for name in files:
+            path = os.path.join(root, name)
+            print (name)
+            if os.path.splitext(path)[1] != ".pdf":
+                continue
+
+            cleanName = name.replace(" ","_")
+            if pathAdd: # prefix with the parent folder: <folder>___<file>
+                fld = os.path.split(root)
+                print (fld)
+                neu = os.path.join(target,fld[1].replace(" ","_")+"___"+cleanName)
+            else:
+                neu = os.path.join(target,cleanName)
+            pathNeu = os.path.splitext(neu)[0] # folder = name without ".pdf"
+
+            if os.path.exists(pathNeu):
+                print("EXISTS:" + path)
+                continue
+
+            os.makedirs(pathNeu+"/full") # also creates pathNeu itself
+            try:
+                shutil.copy(path, pathNeu+"/full/"+cleanName)
+            except (OSError, IOError, shutil.Error):
+                # a bare except would also swallow Ctrl-C (KeyboardInterrupt)
+                errorf.write(path+"\n")
+            print (path)
+
+
+
+    #for name in dirs:
+    #    print(os.path.join(root, name))
+
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doOCR_tmp.py	Tue Aug 19 14:24:36 2014 +0200
@@ -0,0 +1,119 @@
+import shlex
+import subprocess
+import os.path
+import os
+
+class ocrManager:
+    """Splits scanned PDFs into page images and runs tesseract on them."""
+
+    def split(self,foldername,onExistExit=False):
+        """Render the pages of the folder's PDF into <foldername>/pages/.
+
+        The last path component of foldername must have the form
+        "<prefix>___<name>"; the PDF is read from
+        <foldername>/full/<name>.pdf and ImageMagick's convert writes the
+        pages using the output pattern pages/img-%03d.png.
+
+        foldername  -- folder of one document, without trailing slash
+        onExistExit -- if True, do nothing when pages/ already exists
+
+        Returns the exit status of convert, or None if the split was
+        skipped. Raises IndexError if the folder name contains no "___".
+        """
+        filename=foldername.split("/")[-1].split("___")[1]
+
+        outfolder = foldername + "/pages/"
+
+        if not os.path.exists(outfolder):
+            os.mkdir(outfolder)
+        elif onExistExit: #if set don't do anything
+            print (" ---- exists (split)")
+            return
+
+        # Hand the arguments over as a list instead of shlex-splitting one
+        # concatenated string, so paths with spaces or quotes stay intact.
+        # (The old "-\%03d.png" only became "-%03d.png" via shlex unescaping.)
+        listArguments = ["/usr/local/bin/convert", "-verbose", "-density", "300",
+                         foldername + "/full/" + filename + ".pdf",
+                         outfolder + "img-%03d.png"]
+
+        # subprocess.call blocks until convert has finished
+        return subprocess.call(listArguments)
+
+    def ocr(self,foldername,onExistExit=False):
+        """Run tesseract (language deu, hocr output) on each file in pages/.
+
+        For every file <fn> in <foldername>/pages/ tesseract is started with
+        the output base <foldername>/hocr/<fn>; its stdout is echoed line
+        by line.
+
+        foldername  -- folder of one document
+        onExistExit -- if True, do nothing when hocr/ already exists
+
+        Raises OSError if <foldername>/pages/ does not exist.
+        """
+        #tesseract -l deu JB1953-17802.png JB1953-17802.tess hocr
+
+        pagesfolder = foldername + "/pages/"
+        hocrfolder = foldername + "/hocr/"
+
+        if not os.path.exists(hocrfolder):
+            os.mkdir(hocrfolder)
+        elif onExistExit: #if set don't do anything
+            print (" ---- exists (ocrj)")
+            return
+
+        for fn in os.listdir(pagesfolder):
+            # argument list instead of shlex.split: safe for odd file names
+            listArguments = ["/usr/local/bin/tesseract", "-l", "deu",
+                             pagesfolder + fn, hocrfolder + fn, "hocr"]
+            exeShell = subprocess.Popen(listArguments,stdout=subprocess.PIPE)
+
+            # iterating the pipe ends at EOF, i.e. when tesseract closes it
+            for line in exeShell.stdout:
+                #the real code does filtering here
+                print (line.rstrip())
+
+            # the original never closed the pipe or waited for the child;
+            # do both so every tesseract run is finished before the next
+            exeShell.stdout.close()
+            exeShell.wait()
+
+
+
+import sys
+
+# Driver: split and OCR the "1953-1974" folders below foldername.
+om = ocrManager()
+args = sys.argv
+
+# Optional positional arguments, both parsed as integers:
+#   1st -> start: entries with a smaller position are skipped
+#   2nd -> end:   the script exits at the first larger position
+# Positions count every directory entry, including filtered ones.
+start = int(args[1]) if len(args) > 1 else None
+end = int(args[2]) if len(args) > 2 else None
+
+foldername = "/Volumes/MPGARCHIV/struct2/"
+
+# NOTE: os.listdir returns the entries in arbitrary order, so a position
+# is not guaranteed to name the same folder in two different runs.
+for cnt, fn in enumerate(os.listdir(foldername)):
+
+    # only entries whose name contains "1953-1974" are handled
+    if "1953-1974" not in fn:
+        print ("DON't do:"+fn)
+        continue
+
+    if start is not None and cnt<start:
+        print("SKIP: %s"%fn)
+        continue
+
+    if end is not None and cnt>end:
+        sys.exit()
+
+    print("PROCESS: %s"%fn)
+
+    # onExistExit=False: existing pages/ and hocr/ folders are processed again
+    om.split(foldername+fn,onExistExit=False)
+    om.ocr(foldername+fn,onExistExit=False)