Cutting out images: cut_images.py

File cut_images.py, 5.5 KB (added by Klaus Thoden, 13 years ago)

A bit more comfortable

Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# This file is meant to cut images out of whole page images in order to
4# form a directory of figures so that they can be displayed. The input
5# format is a list of URLs containing digilib coordinates.
6#
7# Imagemagick is then called as an external program to cut out the images.
8#
9# Example: http://echo.mpiwg-berlin.mpg.de/ECHOdocuView?pn=3&ws=1&wx=0.0543&wy=0.251&ww=0.7991&wh=0.5829&url=/mpiwg/online/permanent/library/PUBSU9QD&viewMode=images&tocMode=thumbs&tocPN=1&searchPN=1
10#
11# Feature Request: Commented lines to have notes that some pictures should not be cut out
12
13import string, re, os, subprocess, sys
14
15# figfile = "Alvarus_1509_YHKVZ7B4.fig"
16# ID = "PUBSU9QD"
17
# The .fig file listing the digilib URLs is the only command line argument.
if len(sys.argv) < 2:
    sys.exit("Usage: %s FIGFILE" % sys.argv[0])
figfile = sys.argv[1]

# The document ID is the run of eight capitals/digits directly in front of
# a dot, e.g. "YHKVZ7B4" in "Alvarus_1509_YHKVZ7B4.fig". If the path holds
# several candidates, the last one (closest to the file name) is used.
id_matches = re.findall(r"([A-Z0-9]{8})\.", figfile)
if not id_matches:
    # Without an ID every path below would point at the library root.
    sys.exit("Cannot find an eight-character document ID in %r" % figfile)
ID = id_matches[-1]

# Directory of the document, its page images and the figures to be written
fox_dir = "/Volumes/online_permanent/library/%s" % ID
fox_pi = "%s/pageimg" % fox_dir
fox_fig = "%s/figures" % fox_dir

# Address that shows the figures of this document in the ECHO viewer
URL = "http://echo.mpiwg-berlin.mpg.de/ECHOdocuView/template/fulltextclient?url=/mpiwg/online/permanent/library/%s&viewMode=text_dict&tocMode=figures&tocPN=1" % ID
28
29#local_dir = "/Users/kthoden/ECHO_pictures/apian_1550/"
30
def leadZero(num,length=2):
    '''Return num as a string, padded with leading zeros to at least
    length characters (default 2).

    Values that are already long enough are returned unchanged. A
    leading sign stays in front of the padding, so -5 with length 3
    gives "-05".'''
    return str(num).zfill(length)
# def leadZero ends here
37
38
def identify(dim,filename):
    """Run ImageMagick's identify on filename and return its output.

    dim is handed to identify as the argument of -format (the script
    uses "%w" and "%h"). The captured standard output of the command
    is returned unchanged."""
    command = ["identify", "-format", dim, filename]
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() waits for the program and hands back (stdout, stderr)
    captured, _ = process.communicate()
    return captured
# def identify ends here
46
def getpageimg(pidir):
    """Return the sorted list of file names in the pageimg directory.

    Entries whose name starts with a dot (for example .DS_Store) are
    left out: the list is indexed by page number, and such a file would
    sort first and shift every page by one."""
    return sorted(name for name in os.listdir(pidir)
                  if not name.startswith("."))
# def getpageimg ends here
53
# Create the figures directory. An existing directory is never reused, so
# that figures from an earlier run cannot be overwritten or mixed up; the
# script stops with a non-zero exit status instead.
if os.path.exists(fox_fig):
    print("Figures directory %s already exists, please check." % fox_fig)
    sys.exit(1)
else:
    print("Creating direcory %s" % fox_fig)
    os.mkdir(fox_fig)
61
62# if not os.path.exists(fox_fig):
63#     os.mkdir(fox_fig)
64
# Get the page images and put them in a list
imglist = getpageimg(fox_pi)

# Python 2 calls it raw_input(), Python 3 only knows input()
try:
    read_answer = raw_input
except NameError:
    read_answer = input


def _digilib_param(name, url):
    """Return the value of query parameter name in url as a string, or
    None if the parameter is missing or has no numeric value."""
    match = re.search(r"(?:^|[?&])%s=([0-9.]+)" % re.escape(name), url)
    if match is None:
        return None
    return match.group(1)
# def _digilib_param ends here


# Number of the figure on its page; goes up when several figures share
# one page and starts again at 1 on every new page.
imagecounter = 1

# Page number of the previous figure line, None before the first one
old_pn = None

# The cutcount is for counting cut out figures
cutcount = 0

# Checker for comments
comment = False

print("Getting images from %s" % fox_pi)

with open(figfile, "r") as filein:
    for lineno, line in enumerate(filein, 1):
        line = line.strip()

        # Blank lines carry no figure
        if not line:
            continue

        # Lines starting with "#" mark figures that must not be cut out
        if line.startswith("#"):
            comment = True
            print("Commented line")
            continue

        # Page number and relative digilib coordinates of the region
        try:
            pn = int(_digilib_param("pn", line))
            wx, wy, ww, wh = [float(_digilib_param(key, line))
                              for key in ("wx", "wy", "ww", "wh")]
        except (TypeError, ValueError):
            # TypeError: parameter missing, ValueError: not a number
            sys.exit("Line %d of %s has no usable pn/wx/wy/ww/wh values: %s"
                     % (lineno, figfile, line))

        # More than one figure on the same page?
        if pn == old_pn:
            imagecounter = imagecounter + 1
        else:
            imagecounter = 1
        old_pn = pn

        # Page numbers start at 1, the index into imglist at 0. Check the
        # range, otherwise page 0 would silently select the last image.
        pnkorr = pn - 1
        if not 0 <= pnkorr < len(imglist):
            sys.exit("Line %d of %s: page %d does not exist, %s contains %d page images"
                     % (lineno, figfile, pn, fox_pi, len(imglist)))
        page_file = imglist[pnkorr]

        # Construct filename based on foxridge directory and index in pageimg directory
        filename = "%s/%s" % (fox_pi, page_file)

        # Size of the page image, fetched with a single identify call.
        # The newline separates the frames of multi-image files; only the
        # first width/height pair is used.
        dims = identify("%w %h\n", filename).split()
        if len(dims) < 2:
            sys.exit("Could not read the size of %s with identify" % filename)
        width_img = float(dims[0])
        height_img = float(dims[1])

        # absolute coordinates of upper left corner
        abs_x = wx * width_img
        abs_y = wy * height_img

        # absolute size of region we want to extract
        abs_w = ww * width_img
        abs_h = wh * height_img

        # Name of the figure: page image name without its extension,
        # followed by the number of the figure on that page
        pagename = os.path.splitext(page_file)[0]

        # Geometry in ImageMagick notation: WIDTHxHEIGHT+X+Y
        coord_for_im = "%sx%s+%s+%s" % (abs_w,abs_h,abs_x,abs_y)
        output = "%s/%s-%s.jpg" % (fox_fig,pagename,leadZero(imagecounter))

        status = subprocess.call(["convert", "-extract", coord_for_im,filename, output])

        # Only figures that convert really wrote are counted
        if status == 0:
            print("Extracting image %s on page %s" % (leadZero(imagecounter),page_file))
            cutcount = cutcount + 1
        else:
            print("convert failed with exit status %s for %s" % (status, output))

print("Wrote %d figures" % (cutcount))
if comment:
    print("There were commented lines. This might mean that you have to edit the source xml file to remove erroneous image tags. ")


# [Y/n]: the capital letter marks the default, so plain Enter means yes
answer = read_answer("Do you want to view the result in a browser? [Y/n] ").strip().lower()
if answer in ("", "y", "yes"):
    subprocess.call(["open",URL])
else:
    sys.exit()