Mercurial > hg > nutch-mpiwg-plugins
comparison conf/regex-urlfilter.txt @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3b37d71af924 |
---|---|
1 # Licensed to the Apache Software Foundation (ASF) under one or more | |
2 # contributor license agreements. See the NOTICE file distributed with | |
3 # this work for additional information regarding copyright ownership. | |
4 # The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 # (the "License"); you may not use this file except in compliance with | |
6 # the License. You may obtain a copy of the License at | |
7 # | |
8 # http://www.apache.org/licenses/LICENSE-2.0 | |
9 # | |
10 # Unless required by applicable law or agreed to in writing, software | |
11 # distributed under the License is distributed on an "AS IS" BASIS, | |
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 # See the License for the specific language governing permissions and | |
14 # limitations under the License. | |
15 | |
16 | |
17 # The default url filter. | |
18 # Better for whole-internet crawling. | |
19 | |
20 # Each non-comment, non-blank line contains a regular expression | |
21 # prefixed by '+' or '-'. The first matching pattern in the file | |
22 # determines whether a URL is included or ignored. If no pattern | |
23 # matches, the URL is ignored. | |
24 | |
25 # skip file: ftp: and mailto: urls | |
26 -^(file|ftp|mailto): | |
27 | |
28 # skip image and other suffixes we can't yet parse | |
29 # for a more extensive coverage use the urlfilter-suffix plugin | |
30 -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$ | |
31 | |
32 # skip URLs containing certain characters as probable queries, etc. | |
33 -[?*!@=] | |
34 | |
35 # skip URLs with slash-delimited segment that repeats 3+ times, to break loops | |
36 -.*(/[^/]+)/[^/]+\1/[^/]+\1/ | |
37 | |
38 | |
39 # accept only URLs under the local www_neu site; any URL matching no pattern is ignored | |
40 +^http://127.0.0.1:18080/www_neu/.* | |
41 #+^http://127.0.0.1:18080/www_neu/en/staff/index.html | |
42 | |
43 #+^http://127.0.0.1:18080/www_neu/en/staff/members/.* | |
44 | |
45 #+^http://127.0.0.1:18080/www_neu/en/research/index.html | |
46 | |
47 #+^http://127.0.0.1:18080/www_neu/en/research/projects/.* | |
48 | |
49 | |
50 | |
51 | |
52 |