Cutting out images: arch_cut_images.py

File arch_cut_images.py, 5.5 KB (added by Klaus Thoden, 13 years ago)

The same tool, adapted for Archimedes files. One day there will be a single tool for all collections.

Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# This file is meant to cut images out of whole page images in order to
4# form a directory of figures so that they can be displayed. The input
5# format is a list of URLs containing digilib coordinates.
6#
7# Imagemagick is then called as an external program to cut out the images.
8#
9# Example: http://echo.mpiwg-berlin.mpg.de/ECHOdocuView?pn=3&ws=1&wx=0.0543&wy=0.251&ww=0.7991&wh=0.5829&url=/mpiwg/online/permanent/library/PUBSU9QD&viewMode=images&tocMode=thumbs&tocPN=1&searchPN=1
10#
11# Feature Request: Commented lines to have notes that some pictures should not be cut out
12
13import string, re, os, subprocess, sys
14
15# figfile = "Alvarus_1509_YHKVZ7B4.fig"
16# ID = "PUBSU9QD"
17
# The figure list file is the only command line argument.
if len(sys.argv) < 2:
    sys.exit("Usage: %s FIGFILE" % sys.argv[0])
archfigfile=sys.argv[1]

# Name of the list file without directory and extension. All repository
# paths below are derived from it, so a directory part given on the
# command line must not leak into them.
arch_name = os.path.splitext(os.path.basename(archfigfile))[0]

# The ID is the three-digit number enclosed in underscores in the file name.
arch_id_match = re.search(r"_([0-9]{3})_", arch_name)
if arch_id_match is None:
    sys.exit("Cannot find a three-digit ID (_NNN_) in file name %s" % archfigfile)
arch_ID = arch_id_match.group(1)

# Directory of the document, its page images and the figures to be written.
arch_dir = "/Volumes/online_permanent/archimedes_repository/large/%s" % arch_name
arch_pi = "%s/%s-01-pageimg" % (arch_dir, arch_ID)
arch_fig = "%s/figures" % arch_dir

# URL that is offered for viewing the result at the end of the run.
arch_URL = "http://echo.mpiwg-berlin.mpg.de/ECHOdocuView?url=/mpiwg/online/permanent/archimedes/%s&tocMode=figures" % arch_name
27
def leadZero(num,length=2):
    """Return num as a string, left-padded with zeros to length characters.

    num    -- value to format; it is converted with str().
    length -- minimum width of the result (default 2). Values that are
              already at least this wide are returned unchanged.

    A leading sign stays in front of the padding, so leadZero(-5, 3)
    gives "-05" rather than "0-5".
    """
    # str.zfill inserts the zeros after a leading "+" or "-".
    return str(num).zfill(length)
33# def leadZero ends here
34
35
def identify(dim,filename):
    """Ask ImageMagick's identify program for one property of an image.

    dim      -- format string handed to "identify -format".
    filename -- path of the image file to inspect.

    Returns whatever the program wrote to standard output, unmodified.
    """
    command = ["identify", "-format", dim, filename]
    # Route stdout through a pipe so the result ends up in a variable
    # instead of on the terminal.
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    stdout_data, _ = process.communicate()
    return stdout_data
42# def identify ends here
43
def getpageimg(pidir):
    """Return the sorted names of the entries in the pageimg directory.

    pidir -- path of the page image directory.

    Hidden entries (names starting with "."), such as ".DS_Store", are
    left out. The list is used for lookups by position, and an extra
    entry would shift the position of everything sorted after it.

    Raises OSError if pidir cannot be listed.
    """
    return sorted(name for name in os.listdir(pidir)
                  if not name.startswith("."))
49# def getpageimg ends here
50
# Create the figures directory. An existing directory aborts the run so
# that it can be checked by hand first.
if os.path.exists(arch_fig):
    print("Figures directory %s already exists, please check." % arch_fig)
    # Non-zero status: the run was refused, nothing has been cut out.
    sys.exit(1)
else:
    print("Creating direcory %s" % arch_fig)
    os.mkdir(arch_fig)
58
59# if not os.path.exists(arch_fig):
60#     os.mkdir(arch_fig)
61
# Get the page images and put them in a list.
imglist = getpageimg(arch_pi)

# Position of the current figure on its page (1 for the first one); it
# becomes the numeric suffix of the output file name.
imagecounter = 1

# Number of the line currently being processed (1-based), used in error
# messages.
linecount = 0

# Number of figures cut out.
cutcount = 0

# Set to 1 once a commented line has been seen.
comment = 0

# Page number of the previous figure line. None until the first figure
# line has been read, so a comment at the top of the file is harmless.
old_pn = None

# One pattern per digilib parameter that is read from a URL: the page
# number and the relative position and size of the region.
param_patterns = (
    ("pn", re.compile(r"pn=([0-9]*?)&")),
    ("wx", re.compile(r"wx=([0-9.]*?)&")),
    ("wy", re.compile(r"wy=([0-9.]*?)&")),
    ("ww", re.compile(r"ww=([0-9.]*?)&")),
    ("wh", re.compile(r"wh=([0-9.]*?)&")),
)

# The with statement closes the list file even if the run aborts.
with open(archfigfile,"r") as filein:
    print("Getting images from %s" % arch_pi)

    for linecount, line in enumerate(filein, 1):
        line = line.rstrip('\n') # removes line break.

        # Blank lines (for example at the end of the file) carry no figure.
        if not line.strip():
            continue

        # Commented lines are only noted; they do not change the page
        # bookkeeping.
        if line.startswith("#"):
            comment = 1
            print("Commented line")
            continue

        # Pull the parameters out of the URL; stop with a clear message
        # if one of them is missing or empty.
        params = {}
        for key, pattern in param_patterns:
            found = pattern.search(line)
            if found is None or found.group(1) == "":
                sys.exit("Line %d of %s: no value for '%s' found"
                         % (linecount, archfigfile, key))
            params[key] = found.group(1)
        pn = params["pn"]
        wx = params["wx"]
        wy = params["wy"]
        ww = params["ww"]
        wh = params["wh"]

        # Several figures on the same page are numbered consecutively.
        if pn == old_pn:
            imagecounter = imagecounter + 1
        else:
            imagecounter = 1

        # Page numbers start at 1, list positions at 0.
        pnkorr = int(pn)-1
        if not 0 <= pnkorr < len(imglist):
            # Without this check page 0 would silently select the last
            # image through the negative index.
            sys.exit("Line %d of %s: page %s is outside the %d page images in %s"
                     % (linecount, archfigfile, pn, len(imglist), arch_pi))

        # Construct filename based on the pageimg directory and the
        # position in its listing.
        filename = "%s/%s" % (arch_pi,imglist[pnkorr])

        # Size of the page image in pixels.
        width_img = float(identify("%w",filename))
        height_img = float(identify("%h",filename))

        # Absolute coordinates of the upper left corner.
        abs_x = float(wx) * width_img
        abs_y = float(wy) * height_img

        # Absolute size of the region we want to extract.
        abs_w = float(ww) * width_img
        abs_h = float(wh) * height_img

        # The figure's file name is built from the page image name
        # without its last four characters (the extension).
        pagename = str(imglist[pnkorr])[:-4]

        # Geometry argument for ImageMagick: WIDTHxHEIGHT+X+Y.
        coord_for_im = "%sx%s+%s+%s" % (abs_w,abs_h,abs_x,abs_y)
        output = "%s/%s-%s.jpg" % (arch_fig,pagename,leadZero(imagecounter))

        subprocess.call(["convert", "-extract", coord_for_im,filename, output])

        print("Extracting image %s on page %s" % (leadZero(imagecounter),imglist[pnkorr]))

        # Count the figure, store old page number.
        cutcount = cutcount + 1
        old_pn = pn

print("Wrote %d figures" % (cutcount))
if comment == 1:
    print("There were commented lines. This might mean that you have to edit the source xml file to remove erroneous image tags. ")
158
159
# raw_input only exists in Python 2; fall back to input on Python 3.
try:
    ask = raw_input
except NameError:
    ask = input

answer = ask("Do you want to view the result in a browser? [Y/n] ").strip().lower()
# The capital "Y" in the prompt marks yes as the default, so just
# pressing Enter counts as yes.
if answer in ("", "y", "yes"):
    # NOTE: relies on an "open" command being available (as on macOS).
    subprocess.call(["open",arch_URL])
else:
    sys.exit()