annotate conf/regex-normalize.xml @ 0:3b37d71af924 default tip

iniitial
author dwinter
date Tue, 26 Feb 2013 15:50:30 +0100
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3b37d71af924 iniitial
dwinter
parents:
diff changeset
1 <?xml version="1.0"?>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
2 <!--
3b37d71af924 iniitial
dwinter
parents:
diff changeset
3 Licensed to the Apache Software Foundation (ASF) under one or more
3b37d71af924 iniitial
dwinter
parents:
diff changeset
4 contributor license agreements. See the NOTICE file distributed with
3b37d71af924 iniitial
dwinter
parents:
diff changeset
5 this work for additional information regarding copyright ownership.
3b37d71af924 iniitial
dwinter
parents:
diff changeset
6 The ASF licenses this file to You under the Apache License, Version 2.0
3b37d71af924 iniitial
dwinter
parents:
diff changeset
7 (the "License"); you may not use this file except in compliance with
3b37d71af924 iniitial
dwinter
parents:
diff changeset
8 the License. You may obtain a copy of the License at
3b37d71af924 iniitial
dwinter
parents:
diff changeset
9
3b37d71af924 iniitial
dwinter
parents:
diff changeset
10 http://www.apache.org/licenses/LICENSE-2.0
3b37d71af924 iniitial
dwinter
parents:
diff changeset
11
3b37d71af924 iniitial
dwinter
parents:
diff changeset
12 Unless required by applicable law or agreed to in writing, software
3b37d71af924 iniitial
dwinter
parents:
diff changeset
13 distributed under the License is distributed on an "AS IS" BASIS,
3b37d71af924 iniitial
dwinter
parents:
diff changeset
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3b37d71af924 iniitial
dwinter
parents:
diff changeset
15 See the License for the specific language governing permissions and
3b37d71af924 iniitial
dwinter
parents:
diff changeset
16 limitations under the License.
3b37d71af924 iniitial
dwinter
parents:
diff changeset
17 -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
18 <!-- This is the configuration file for the RegexUrlNormalize Class.
3b37d71af924 iniitial
dwinter
parents:
diff changeset
19 This is intended so that users can specify substitutions to be
3b37d71af924 iniitial
dwinter
parents:
diff changeset
20 done on URLs. The regex engine that is used is Perl5 compatible.
3b37d71af924 iniitial
dwinter
parents:
diff changeset
21 The rules are applied to URLs in the order they occur in this file. -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
22
3b37d71af924 iniitial
dwinter
parents:
diff changeset
23 <!-- WATCH OUT: an XML parser reads this file, so ampersands must be
3b37d71af924 iniitial
dwinter
parents:
diff changeset
24 expanded to &amp; -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
25
3b37d71af924 iniitial
dwinter
parents:
diff changeset
26 <!-- The following rules show how to strip out session IDs, default pages,
3b37d71af924 iniitial
dwinter
parents:
diff changeset
27 interpage anchors, etc. Order does matter! -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
28 <regex-normalize>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
29
3b37d71af924 iniitial
dwinter
parents:
diff changeset
30 <!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
31 <regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
32 <pattern>(?i)(;?\b_?(l|j|bv_)?(sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
33 <substitution>$4</substitution>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
34 </regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
35
3b37d71af924 iniitial
dwinter
parents:
diff changeset
36 <!-- (disabled: this rule is commented out) rewrites default pages such as /index.html to /
3b37d71af924 iniitial
dwinter
parents:
diff changeset
37 <regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
38 <pattern>/((?i)index|default)\.((?i)js[pf]{1}?[afx]?|cgi|cfm|asp[x]?|[psx]?htm[l]?|php[3456]?)(\?|&amp;|#|$)</pattern>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
39 <substitution>/$3</substitution>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
40 </regex> -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
41
3b37d71af924 iniitial
dwinter
parents:
diff changeset
42 <!-- removes interpage href anchors such as site.com#location -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
43 <regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
44 <pattern>#.*?(\?|&amp;|$)</pattern>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
45 <substitution>$1</substitution>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
46 </regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
47
3b37d71af924 iniitial
dwinter
parents:
diff changeset
48 <!-- cleans ?&amp;var=value into ?var=value -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
49 <regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
50 <pattern>\?&amp;</pattern>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
51 <substitution>\?</substitution>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
52 </regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
53
3b37d71af924 iniitial
dwinter
parents:
diff changeset
54 <!-- cleans multiple sequential ampersands into a single ampersand -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
55 <regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
56 <pattern>&amp;{2,}</pattern>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
57 <substitution>&amp;</substitution>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
58 </regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
59
3b37d71af924 iniitial
dwinter
parents:
diff changeset
60 <!-- removes a trailing ?, &amp; or . -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
61 <regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
62 <pattern>[\?&amp;\.]$</pattern>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
63 <substitution></substitution>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
64 </regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
65
3b37d71af924 iniitial
dwinter
parents:
diff changeset
66 <!-- removes duplicate slashes -->
3b37d71af924 iniitial
dwinter
parents:
diff changeset
67 <regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
68 <pattern>(?&lt;!:)/{2,}</pattern>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
69 <substitution>/</substitution>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
70 </regex>
3b37d71af924 iniitial
dwinter
parents:
diff changeset
71
3b37d71af924 iniitial
dwinter
parents:
diff changeset
72 </regex-normalize>