Mercurial > hg > nutch-mpiwg-plugins
comparison conf/automaton-urlfilter.txt.template @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3b37d71af924 |
---|---|
1 # Licensed to the Apache Software Foundation (ASF) under one or more | |
2 # contributor license agreements. See the NOTICE file distributed with | |
3 # this work for additional information regarding copyright ownership. | |
4 # The ASF licenses this file to You under the Apache License, Version 2.0 | |
5 # (the "License"); you may not use this file except in compliance with | |
6 # the License. You may obtain a copy of the License at | |
7 # | |
8 # http://www.apache.org/licenses/LICENSE-2.0 | |
9 # | |
10 # Unless required by applicable law or agreed to in writing, software | |
11 # distributed under the License is distributed on an "AS IS" BASIS, | |
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 # See the License for the specific language governing permissions and | |
14 # limitations under the License. | |
15 | |
16 # The default url filter. | |
17 # Better for whole-internet crawling. | |
18 | |
19 # Each non-comment, non-blank line contains a regular expression | |
20 # prefixed by '+' or '-'. The first matching pattern in the file | |
21 # determines whether a URL is included or ignored. If no pattern | |
22 # matches, the URL is ignored. | |
23 | |
24 # skip file:, ftp:, and mailto: URLs | |
25 -(file|ftp|mailto):.* | |
26 | |
27 # skip image and other suffixes we can't yet parse | |
28 # for a more extensive coverage use the urlfilter-suffix plugin | |
29 -.*\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS) | |
30 | |
31 # skip URLs containing certain characters as probable queries, etc. | |
32 -.*[?*!@=].* | |
33 | |
34 # accept anything else | |
35 +.* |