annotate query.py @ 2:6d8b6a689b2b default tip

changed to bs4
author dwinter
date Mon, 15 Oct 2012 15:09:35 +0200
parents 768cb7284374
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
57e2aa489383 initial
dwinter
parents:
diff changeset
1 from datetime import datetime
57e2aa489383 initial
dwinter
parents:
diff changeset
2
57e2aa489383 initial
dwinter
parents:
diff changeset
3 from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey, DateTime, select
57e2aa489383 initial
dwinter
parents:
diff changeset
4
57e2aa489383 initial
dwinter
parents:
diff changeset
5 import settings
57e2aa489383 initial
dwinter
parents:
diff changeset
6
57e2aa489383 initial
dwinter
parents:
diff changeset
class CrawlerDb:
    """Persistence layer for the crawler, built on SQLAlchemy.

    Three tables are managed:

    * ``queue``    -- addresses waiting to be crawled
    * ``crawl``    -- pages that have been fetched
    * ``keywords`` -- per-page keyword weights, linked to ``crawl.id``

    Every public method returns ``False`` instead of raising when the
    database has not been connected yet (or has been closed).
    """

    def __init__(self):
        """Create an unconnected instance; call ``connect()`` before use."""
        self.connected = False
        # Pre-declare the handles so close() is safe before connect().
        self.engine = None
        self.connection = None

    def connect(self):
        """Open the database described by ``settings`` and create the tables.

        The engine URL is assembled from ``settings.DATABASE_ENGINE``,
        ``DATABASE_HOST`` and -- depending on the engine -- ``DATABASE_USER``,
        ``DATABASE_PASS``, ``DATABASE_PORT`` and ``DATABASE_NAME``.
        Errors raised by SQLAlchemy while connecting or creating the tables
        propagate to the caller; ``self.connected`` is only set once
        everything succeeded.
        """
        url = settings.DATABASE_ENGINE + "://"
        port = ""
        if settings.DATABASE_ENGINE == "mysql":
            url += settings.DATABASE_USER + ":" + settings.DATABASE_PASS + "@"
            # str() so that an integer port in settings does not raise.
            port = ":" + str(settings.DATABASE_PORT)

        url += settings.DATABASE_HOST + port
        if settings.DATABASE_ENGINE != "sqlite":
            url += "/" + settings.DATABASE_NAME

        self.engine = create_engine(url)
        self.connection = self.engine.connect()
        self.metadata = MetaData()

        # Define the tables
        # NOTE(review): the String columns have no length; SQLAlchemy needs
        # one to emit VARCHAR on MySQL -- confirm before using that engine.
        self.queue_table = Table('queue', self.metadata,
            Column('id', Integer, primary_key=True),
            Column('address', String, nullable=False),
            # Pass the callable, not its result: datetime.now() would be
            # evaluated once here and stamp every row with the same time.
            Column('added', DateTime, nullable=False, default=datetime.now)
        )

        self.crawl_table = Table('crawl', self.metadata,
            Column('id', Integer, primary_key=True),
            Column('address', String, nullable=False),
            Column('http_status', String, nullable=False),
            Column('title', String, nullable=True),
            Column('size', Integer, nullable=True),
        )

        self.keyword_table = Table('keywords', self.metadata,
            Column('id', Integer, primary_key=True),
            Column('page_id', None, ForeignKey('crawl.id')),
            Column('keyword', String, nullable=False),
            Column('weight', Integer, nullable=False),
        )

        # Create the tables
        self.metadata.create_all(self.engine)

        # Only report "connected" once the schema is usable.
        self.connected = True if self.connection else False

    def enqueue(self, urls):
        """Append ``urls`` to the crawl queue.

        Returns ``True`` on success (including an empty ``urls``), ``False``
        when not connected or when the insert fails.
        """
        if not self.connected:
            return False
        if len(urls) == 0:
            return True
        try:
            args = [{'address': unicode(u)} for u in urls]
            result = self.connection.execute(self.queue_table.insert(), args)
        except Exception:
            # Best effort: conversion and database errors are reported as
            # False, but KeyboardInterrupt/SystemExit are no longer swallowed.
            return False
        return True if result else False

    def dequeue(self):
        """Remove the oldest queue entry and return its address.

        Returns ``False`` when not connected, when the queue is empty or
        when the delete statement yields no result.
        """
        if not self.connected:
            return False
        # Get the first thing in the queue; order by id so "first" is
        # well defined instead of being whatever row the database picks.
        query = select([self.queue_table]).order_by(self.queue_table.c.id).limit(1)
        res = self.connection.execute(query)
        try:
            row = res.fetchone()
        finally:
            res.close()
        if row is None:
            return False
        # Remove from the queue
        delres = self.connection.execute(
            self.queue_table.delete().where(self.queue_table.c.id == row[0]))
        if not delres:
            return False
        # Return the address column of the row
        return row[1]

    def checkCrawled(self, url):
        """Return ``True`` if ``url`` is already present in the crawl table.

        Returns ``False`` when not connected or when the lookup fails.
        """
        if not self.connected:
            return False
        try:
            query = select([self.crawl_table]).where(
                self.crawl_table.c.address == unicode(url)).limit(1)
            result = self.connection.execute(query)
            try:
                return result.fetchone() is not None
            finally:
                # Release the cursor even if fetching raised.
                result.close()
        except Exception:
            return False

    def addPage(self, data):
        """Store a crawled page and its keywords.

        ``data`` is a dictionary with the following keys:

        * ``address``  -- the url of the page
        * ``status``   -- the status code returned by the request
          (stored in the ``http_status`` column)
        * ``title``    -- the contents of the <title> element
        * ``size``     -- the size of the returned content in bytes
        * ``keywords`` -- mapping of keyword -> weight

        Returns ``True`` on success, ``False`` when not connected, when a
        text value cannot be decoded, or when an insert yields no result.
        """
        if not self.connected:
            return False
        try:
            # Convert every text value before touching the database, so a
            # bad keyword cannot leave a page row without its keywords.
            address = unicode(data['address'])
            title = unicode(data['title'])
            keywords = [(unicode(k.lower()), w) for k, w in data["keywords"].items()]
        except UnicodeDecodeError:
            return False

        # Add the page to the crawl table
        result = self.connection.execute(
            self.crawl_table.insert().values(
                address=address,
                http_status=data['status'],
                title=title,
                size=data['size']))
        if not result:
            return False

        # generate list of argument dictionaries for the insert many statement
        page_id = result.inserted_primary_key[0]
        args = [{"page_id": page_id, "keyword": k, "weight": w} for k, w in keywords]
        # Add all the keywords
        if len(args) > 0:
            result2 = self.connection.execute(self.keyword_table.insert(), args)
            if not result2:
                return False
        return True

    def close(self):
        """Close the connection, if any, and mark the instance unconnected.

        Safe to call before ``connect()`` and more than once.
        """
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        self.connected = False