|
0
|
1 from datetime import datetime
|
|
|
2
|
|
|
3 from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey, DateTime, select
|
|
|
4
|
|
|
5 import settings
|
|
|
6
|
|
|
class CrawlerDb:
	"""Persistence layer for the crawler.

	Manages three tables via SQLAlchemy:
	  queue    -- urls waiting to be crawled
	  crawl    -- pages already fetched (status, title, size)
	  keywords -- per-page keyword weights, FK'd to crawl.id

	Call connect() before using any other method; every operation
	returns False when not connected rather than raising.
	"""

	def __init__(self):
		# No engine/connection yet -- connect() must be called first.
		self.connected = False

	def connect(self):
		"""Build the database URL from settings, connect, and create tables."""
		url = settings.DATABASE_ENGINE + "://"
		port = ""
		if settings.DATABASE_ENGINE == "mysql":
			# MySQL needs credentials and an explicit port in the URL.
			url += settings.DATABASE_USER + ":" + settings.DATABASE_PASS + "@"
			port = ":" + settings.DATABASE_PORT
		url += settings.DATABASE_HOST + port
		if settings.DATABASE_ENGINE != "sqlite":
			# sqlite URLs carry the db file path in DATABASE_HOST instead.
			url += "/" + settings.DATABASE_NAME
		self.engine = create_engine(url)
		self.connection = self.engine.connect()
		self.connected = self.connection is not None
		self.metadata = MetaData()

		# Define the tables
		self.queue_table = Table('queue', self.metadata,
			Column('id', Integer, primary_key=True),
			Column('address', String, nullable=False),
			# BUG FIX: pass the callable, not datetime.now() -- calling it
			# here would freeze the timestamp at table-definition time, so
			# every queued row would share one 'added' value.
			Column('added', DateTime, nullable=False, default=datetime.now),
		)

		self.crawl_table = Table('crawl', self.metadata,
			Column('id', Integer, primary_key=True),
			Column('address', String, nullable=False),
			Column('http_status', String, nullable=False),
			Column('title', String, nullable=True),
			Column('size', Integer, nullable=True),
		)

		self.keyword_table = Table('keywords', self.metadata,
			Column('id', Integer, primary_key=True),
			Column('page_id', None, ForeignKey('crawl.id')),
			Column('keyword', String, nullable=False),
			Column('weight', Integer, nullable=False),
		)

		# Create the tables
		self.metadata.create_all(self.engine)

	def enqueue(self, urls):
		"""Insert each url in `urls` into the queue.

		Returns True on success (an empty list is a trivial success),
		False when not connected or the insert fails.
		"""
		if not self.connected:
			return False
		if not urls:
			return True
		try:
			args = [{'address': unicode(u)} for u in urls]
			result = self.connection.execute(self.queue_table.insert(), args)
		except Exception:
			# Narrowed from a bare except: -- a bare clause also swallows
			# SystemExit/KeyboardInterrupt and hides programming errors.
			return False
		return bool(result)

	def dequeue(self):
		"""Pop the first queued url.

		Returns the address string, or False when the queue is empty,
		the delete fails, or we are not connected.
		"""
		if not self.connected:
			return False
		# Get the first thing in the queue
		s = select([self.queue_table]).limit(1)
		res = self.connection.execute(s)
		rows = res.fetchall()
		res.close()
		if not rows:
			return False
		row = rows[0]
		# Remove it from the queue so it is not handed out twice.
		delres = self.connection.execute(
			self.queue_table.delete().where(self.queue_table.c.id == row[0]))
		if not delres:
			return False
		# Return the address column of the dequeued row.
		return row[1]

	def checkCrawled(self, url):
		"""Return True if `url` is already recorded in the crawl table."""
		if not self.connected:
			# Guard added for consistency with the other methods.
			return False
		try:
			s = select([self.crawl_table]).where(
				self.crawl_table.c.address == unicode(url))
			result = self.connection.execute(s)
			try:
				found = len(result.fetchall()) > 0
			finally:
				# Always release the result proxy, even if fetchall raises.
				result.close()
			return found
		except Exception:
			# Narrowed from a bare except: (see enqueue).
			return False

	# Data should be a dictionary containing the following
	# key : desc
	# address : the url of the page
	# http_status : the status code returned by the request
	# title : the contents of the <title> element
	# size : the size of the returned content in bytes
	# keywords : dict mapping keyword -> weight
	def addPage(self, data):
		"""Record a crawled page and its keyword weights.

		Returns True on success, False when not connected or an insert
		fails (including undecodable address/title text).
		"""
		if not self.connected:
			return False
		# Add the page to the crawl table
		try:
			result = self.connection.execute(self.crawl_table.insert().values(
				address=unicode(data['address']),
				http_status=data['status'],
				title=unicode(data['title']),
				size=data['size']))
		except UnicodeDecodeError:
			return False
		if not result:
			return False
		# generate list of argument dictionaries for the insert many statement
		args = [{"page_id": result.inserted_primary_key[0],
				 "keyword": unicode(k.lower()),
				 "weight": w}
				for k, w in data["keywords"].items()]
		# Add all the keywords
		if args:
			result2 = self.connection.execute(self.keyword_table.insert(), args)
			if not result2:
				return False
		return True

	def close(self):
		"""Release the database connection."""
		self.connection.close()