Context Navigation

Back to Cutting out images

Cutting out images: cut_images.py

File cut_images.py, 5.5 KB (added by Klaus Thoden, 13 years ago)
A bit more comfortable

Line
1	#!/usr/bin/python
2	# -*- coding: utf-8 -*-
3	# This file is meant to cut images out of whole page images in order to
4	# form a directory of figures so that they can be displayed. The input
5	# format is a list of URLs containing digilib coordinates.
6	#
7	# Imagemagick is then called as an external program to cut out the images.
8	#
9	# Example: http://echo.mpiwg-berlin.mpg.de/ECHOdocuView?pn=3&ws=1&wx=0.0543&wy=0.251&ww=0.7991&wh=0.5829&url=/mpiwg/online/permanent/library/PUBSU9QD&viewMode=images&tocMode=thumbs&tocPN=1&searchPN=1
10	#
11	# Feature Request: Commented lines to have notes that some pictures should not be cut out
12
13	import string, re, os, subprocess, sys
14
15	# figfile = "Alvarus_1509_YHKVZ7B4.fig"
16	# ID = "PUBSU9QD"
17
# The single command-line argument is the .fig file: a text file with one
# digilib URL per line (lines starting with "#" are treated as comments).
if len(sys.argv) < 2:
    sys.exit("Usage: %s FIGFILE" % sys.argv[0])
figfile = sys.argv[1]

# The document ID is the run of eight capitals/digits directly in front of
# a dot, e.g. "YHKVZ7B4" in "Alvarus_1509_YHKVZ7B4.fig".  If the path holds
# several such runs, the last one (closest to the file extension) is used.
# Without an ID every path below would point at the library root, so stop
# instead of carrying on with an empty string.
id_candidates = re.findall(r"([A-Z0-9]{8})\.", figfile)
if not id_candidates:
    sys.exit("Cannot find an 8-character document ID in %s" % figfile)
ID = id_candidates[-1]

# Directory of the document, its page images, and the figures to be written.
fox_dir = "/Volumes/online_permanent/library/%s" % ID
fox_pi = "%s/pageimg" % fox_dir
fox_fig = "%s/figures" % fox_dir

# Viewer page that lists the figures of this document.
URL = "http://echo.mpiwg-berlin.mpg.de/ECHOdocuView/template/fulltextclient?url=/mpiwg/online/permanent/library/%s&viewMode=text_dict&tocMode=figures&tocPN=1" % ID
28
29	#local_dir = "/Users/kthoden/ECHO_pictures/apian_1550/"
30
def leadZero(num, length=2):
    """Return *num* as a string, left-padded with zeros to *length* characters.

    num    -- the value to format; it is converted with str() first, so both
              numbers and numeric strings are accepted.
    length -- minimum width of the result (default 2).  Values that are
              already at least this wide are returned unpadded.

    A leading sign is kept in front of the padding (-5 -> "-05").
    """
    return str(num).zfill(length)
37
38
def identify(dim, filename):
    """Run ImageMagick's ``identify`` on *filename* and return its output.

    dim      -- the string handed to ``identify -format`` (the script uses
                "%w" and "%h").
    filename -- path of the image to inspect.

    Returns whatever the program wrote to standard output, unmodified.
    """
    command = ["identify", "-format", dim, filename]
    # Route stdout into a pipe so the text can be collected instead of
    # being printed to the terminal.
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    captured_stdout, _unused_stderr = process.communicate()
    return captured_stdout
46
def getpageimg(pidir):
    """Return the sorted names of the page images in directory *pidir*.

    Hidden entries (names starting with a dot, e.g. ".DS_Store") are left
    out: the caller looks pages up by position in this list, and a hidden
    file would sort first and shift every index by one.
    """
    return sorted(
        name for name in os.listdir(pidir) if not name.startswith(".")
    )
53
# ---------------------------------------------------------------------------
# Main script: read the figure list, cut every listed region out of its page
# image with ImageMagick's ``convert`` and store it in the figures directory.
# ---------------------------------------------------------------------------

# Never write into an existing figures directory; abort with a non-zero
# exit status so that calling scripts can tell nothing was done.
if os.path.exists(fox_fig):
    print("Figures directory %s already exists, please check." % fox_fig)
    sys.exit(1)
print("Creating directory %s" % fox_fig)
os.mkdir(fox_fig)

# Sorted page images; page number pn (1-based) lives at index pn - 1.
imglist = getpageimg(fox_pi)

# The digilib parameters needed from every URL, each with the pattern that
# captures its value and the conversion applied to it.  pn is the page
# number; wx/wy/ww/wh are the region's corner and size as fractions of the
# page width/height.
_FIGURE_PARAMS = (
    ("pn", re.compile(r"\bpn=([0-9]+)"), int),
    ("wx", re.compile(r"\bwx=([0-9.]+)"), float),
    ("wy", re.compile(r"\bwy=([0-9.]+)"), float),
    ("ww", re.compile(r"\bww=([0-9.]+)"), float),
    ("wh", re.compile(r"\bwh=([0-9.]+)"), float),
)


def _parse_figure_url(url):
    """Return (pn, wx, wy, ww, wh) read from *url*, or None if any is missing."""
    values = []
    for _name, pattern, convert in _FIGURE_PARAMS:
        found = pattern.search(url)
        if found is None:
            return None
        try:
            values.append(convert(found.group(1)))
        except ValueError:
            # Digits and dots that do not form a number, e.g. "0.5.1".
            return None
    return tuple(values)


# Number of the figure on its page (1 for the first figure of a page).
imagecounter = 1
# Page number of the previously processed figure line; None before the first.
old_pn = None
# Number of figures actually written.
cutcount = 0
# Set once a commented line has been seen.
comment = False

print("Getting images from %s" % fox_pi)

with open(figfile, "r") as filein:
    for lineno, line in enumerate(filein, 1):
        line = line.strip()
        if not line:
            continue  # blank lines carry no figure

        if line.startswith("#"):
            comment = True
            print("Commented line")
            continue

        coords = _parse_figure_url(line)
        if coords is None:
            print("Skipping line %d: no digilib coordinates found" % lineno)
            continue
        pn, wx, wy, ww, wh = coords

        # Adjust the 1-based page number to the 0-based list index.  The
        # range check also keeps pn=0 from silently selecting the last page.
        pnkorr = pn - 1
        if not 0 <= pnkorr < len(imglist):
            print("Skipping line %d: page %d is not in %s" % (lineno, pn, fox_pi))
            continue

        # Consecutive figures on the same page are numbered 01, 02, ...
        if pn == old_pn:
            imagecounter += 1
        else:
            imagecounter = 1
        old_pn = pn

        pagefile = imglist[pnkorr]
        filename = "%s/%s" % (fox_pi, pagefile)

        # Pixel size of the page image.
        width_img = float(identify("%w", filename))
        height_img = float(identify("%h", filename))

        # Absolute coordinates of the upper left corner.
        abs_x = wx * width_img
        abs_y = wy * height_img

        # Absolute size of the region to extract.
        abs_w = ww * width_img
        abs_h = wh * height_img

        # Figure file name: <page image name without extension>-<counter>.jpg
        pagename = os.path.splitext(pagefile)[0]
        coord_for_im = "%sx%s+%s+%s" % (abs_w, abs_h, abs_x, abs_y)
        output = "%s/%s-%s.jpg" % (fox_fig, pagename, leadZero(imagecounter))

        status = subprocess.call(["convert", "-extract", coord_for_im, filename, output])
        if status != 0:
            # Do not count figures that convert could not write.
            print("convert failed with exit status %d for %s" % (status, output))
            continue

        print("Extracting image %s on page %s" % (leadZero(imagecounter), pagefile))
        cutcount += 1

print("Wrote %d figures" % cutcount)
if comment:
    print("There were commented lines. This might mean that you have to edit the source xml file to remove erroneous image tags. ")

# raw_input only exists on Python 2; fall back to input elsewhere.
try:
    _ask = raw_input
except NameError:
    _ask = input

# "[Y/n]" advertises yes as the default, so an empty answer counts as yes.
answer = _ask("Do you want to view the result in a browser? [Y/n] ").strip().lower()
if answer in ("", "y", "yes"):
    # NOTE(review): "open" is presumably the macOS launcher -- confirm
    # before running this on another platform.
    subprocess.call(["open", URL])

Download in other formats:

Original Format