Mercurial > hg > nutch-mpiwg-plugins
comparison conf/regex-normalize.xml @ 0:3b37d71af924 default tip
initial
author | dwinter |
---|---|
date | Tue, 26 Feb 2013 15:50:30 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3b37d71af924 |
---|---|
1 <?xml version="1.0"?> | |
2 <!-- | |
3 Licensed to the Apache Software Foundation (ASF) under one or more | |
4 contributor license agreements. See the NOTICE file distributed with | |
5 this work for additional information regarding copyright ownership. | |
6 The ASF licenses this file to You under the Apache License, Version 2.0 | |
7 (the "License"); you may not use this file except in compliance with | |
8 the License. You may obtain a copy of the License at | |
9 | |
10 http://www.apache.org/licenses/LICENSE-2.0 | |
11 | |
12 Unless required by applicable law or agreed to in writing, software | |
13 distributed under the License is distributed on an "AS IS" BASIS, | |
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
15 See the License for the specific language governing permissions and | |
16 limitations under the License. | |
17 --> | |
18 <!-- This is the configuration file for the RegexUrlNormalize Class. | |
19 This is intended so that users can specify substitutions to be | |
20 done on URLs. The regex engine that is used is Perl5 compatible. | |
21 The rules are applied to URLs in the order they occur in this file. --> | |
22 | |
23 <!-- WATCH OUT: an XML parser reads this file, so ampersands must be | |
24 written as &amp; --> | |
25 | |
26 <!-- The following rules show how to strip out session IDs, default pages, | |
27 interpage anchors, etc. Order does matter! --> | |
28 <regex-normalize> | |
29 | |
30 <!-- removes session ids from urls (such as jsessionid and PHPSESSID) --> | |
31 <regex> | |
32 <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern> | |
33 <substitution>$4</substitution> | |
34 </regex> | |
35 | |
36 <!-- changes default pages (/index.html, /default.asp, etc.) into /; this rule is disabled, the comment ends after the closing regex tag | |
37 <regex> | |
38 <pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&|#|$)</pattern> | |
39 <substitution>/$3</substitution> | |
40 </regex> --> | |
41 | |
42 <!-- removes interpage href anchors such as site.com#location --> | |
43 <regex> | |
44 <pattern>#.*?(\?|&amp;|$)</pattern> | |
45 <substitution>$1</substitution> | |
46 </regex> | |
47 | |
48 <!-- cleans ?&var=value into ?var=value --> | |
49 <regex> | |
50 <pattern>\?&amp;</pattern> | |
51 <substitution>\?</substitution> | |
52 </regex> | |
53 | |
54 <!-- cleans multiple sequential ampersands into a single ampersand --> | |
55 <regex> | |
56 <pattern>&amp;{2,}</pattern> | |
57 <substitution>&amp;</substitution> | |
58 </regex> | |
59 | |
60 <!-- removes a trailing ?, & or . --> | |
61 <regex> | |
62 <pattern>[\?&amp;\.]$</pattern> | |
63 <substitution></substitution> | |
64 </regex> | |
65 | |
66 <!-- removes duplicate slashes --> | |
67 <regex> | |
68 <pattern>(?&lt;!:)/{2,}</pattern> | |
69 <substitution>/</substitution> | |
70 </regex> | |
71 | |
72 </regex-normalize> |