1 package org.archive.io.hbase;
2
3 /**
4 * Copyright 2010 The Apache Software Foundation
5 *
6 * Licensed to the Apache Software Foundation (ASF) under one
7 * or more contributor license agreements. See the NOTICE file
8 * distributed with this work for additional information
9 * regarding copyright ownership. The ASF licenses this file
10 * to you under the Apache License, Version 2.0 (the
11 * "License"); you may not use this file except in compliance
12 * with the License. You may obtain a copy of the License at
13 *
14 * http://www.apache.org/licenses/LICENSE-2.0
15 *
16 * Unless required by applicable law or agreed to in writing, software
17 * distributed under the License is distributed on an "AS IS" BASIS,
18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 * See the License for the specific language governing permissions and
20 * limitations under the License.
21 */
22
23 import java.util.StringTokenizer;
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26
27 /**
 * Utility for creating hbase-friendly keys. Use it for fabricating row names or
 * column qualifiers.
30 * <p>
31 * TODO: Add createSchemeless key, a key that doesn't care if scheme is http or
32 * https.
33 *
34 * @see Bytes#split(byte[], byte[], int)
35 */
public class Keying {
    /**
     * The artificial <code>r:</code> scheme used in the {@link #createKey(String, String)}
     * example. It is not referenced inside this class; callers presumably pass it
     * as the <code>scheme</code> argument.
     */
    public static final String REFERER_URL_SCHEME = "r:";

    /**
     * Splits a hierarchical URI into three groups:
     * <ol>
     * <li>the scheme, <code>://</code> and an optional <code>userinfo@</code>;</li>
     * <li>the host, i.e. everything up to the first <code>:</code>, <code>/</code>,
     * <code>?</code> or <code>#</code>;</li>
     * <li>the remainder (port, path, query, fragment), possibly empty.</li>
     * </ol>
     */
    private static final Pattern URI_RE_PARSER = Pattern.compile("^([^:/?#]+://(?:[^/?#@]+@)?)([^:/?#]+)(.*)$");

    /** Separator between the labels of a host name. */
    public static final String DOMAIN_NAME_DELIMITER = ".";

    /**
     * Makes a key out of the passed URI for use as row name or column qualifier.
     *
     * This method runs transforms on the passed URI so it sits better as a key
     * (or portion-of-a-key) in hbase. The <code>host</code> portion of the URI
     * authority is reversed so subdomains sort under their parent domain, and
     * the given <code>scheme</code> is prepended so the result is not mistaken
     * for a URI of the original scheme. For example, with scheme
     * <code>r:</code> the url
     * <code>http://lucene.apache.org/index.html?query=something#middle</code>
     * is returned as
     * <code>r:http://org.apache.lucene/index.html?query=something#middle</code>.
     * The transform is reversible with {@link #keyToUri(String, String)}.
     * If the passed URI is not hierarchical (it does not look like
     * <code>scheme://host...</code>) it is returned unchanged, without the
     * scheme prefix.
     *
     * <p>
     * If authority <code>userinfo</code> is present, it stays in front of the
     * host and will mess up the sort (until we do more work).
     * </p>
     *
     * @param u
     *            URL to transform; <code>null</code> is returned as
     *            <code>null</code>.
     * @param scheme
     *            Prefix to put in front of the key; <code>null</code> is
     *            treated as the empty string.
     * @return The key: <code>scheme</code> followed by the URI with the host
     *         portion reversed, or <code>u</code> itself when it is not
     *         hierarchical.
     * @throws IllegalArgumentException
     *             if <code>scheme</code> is non-empty and <code>u</code>
     *             already starts with it.
     * @see #keyToUri(String, String)
     * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC2396</a>
     */
    public static String createKey(final String u, String scheme) {
        if (u == null) {
            // Nothing to transform; also avoids an NPE in startsWith below.
            return null;
        }
        // A null scheme must not end up as the literal text "null" in the key.
        final String prefix = (scheme == null) ? "" : scheme;
        if (prefix.length() > 0 && u.startsWith(prefix)) {
            throw new IllegalArgumentException("Key already starts with a scheme: " + scheme);
        }
        final Matcher m = getURIMatcher(u);
        if (m == null || !m.matches()) {
            // Not hierarchical: return the original String.
            return u;
        }
        return prefix + m.group(1) + reverseHostname(m.group(2)) + m.group(3);
    }

    /**
     * Reverses the {@link #createKey(String, String)} transform.
     *
     * @param s
     *            Key made by {@link #createKey(String, String)}.
     * @param scheme
     *            The prefix that was passed to
     *            {@link #createKey(String, String)}; matched against the start
     *            of <code>s</code> ignoring case.
     * @return The 'restored' URI: <code>s</code> with the prefix removed and the
     *         host put back in its original order. <code>s</code> is returned
     *         unchanged when either argument is <code>null</code>, when
     *         <code>s</code> does not start with <code>scheme</code>, or when
     *         the remainder is not a hierarchical URI.
     */
    public static String keyToUri(final String s, final String scheme) {
        if (scheme == null || s == null) {
            return s;
        }
        // regionMatches compares in place: no lower-cased copies, no dependence on
        // the default locale, and offsets stay valid for the substring below.
        if (!s.regionMatches(true, 0, scheme, 0, scheme.length())) {
            return s;
        }
        final Matcher m = getURIMatcher(s.substring(scheme.length()));
        if (m == null || !m.matches()) {
            // Not hierarchical: return the original String.
            return s;
        }
        return m.group(1) + reverseHostname(m.group(2)) + m.group(3);
    }

    /**
     * Returns a matcher of {@link #URI_RE_PARSER} over the given text, or
     * <code>null</code> when the text is <code>null</code> or empty. The caller
     * still has to invoke {@link Matcher#matches()}.
     */
    private static Matcher getURIMatcher(final String uriText) {
        if (uriText == null || uriText.length() == 0) {
            return null;
        }
        return URI_RE_PARSER.matcher(uriText);
    }

    /**
     * Reverses the order of the {@link #DOMAIN_NAME_DELIMITER}-separated labels
     * of a host name, e.g. <code>lucene.apache.org</code> becomes
     * <code>org.apache.lucene</code>.
     * <p>
     * Empty labels are preserved, so a fully-qualified name with a trailing dot
     * such as <code>example.com.</code> becomes <code>.com.example</code>. The
     * result always has the same length as the input, and applying the method
     * twice yields the original string.
     * </p>
     *
     * @param hostname
     *            Host name to reverse; may be <code>null</code>.
     * @return The host name with its labels in reverse order, or the empty
     *         string when <code>hostname</code> is <code>null</code>.
     */
    public static String reverseHostname(final String hostname) {
        if (hostname == null) {
            return "";
        }
        final int delimiterLength = DOMAIN_NAME_DELIMITER.length();
        final StringBuilder reversed = new StringBuilder(hostname.length());
        // Walk the labels from right to left and append each one, instead of
        // prepending while walking left to right (which is quadratic).
        int end = hostname.length();
        while (true) {
            final int delimiterAt = hostname.lastIndexOf(DOMAIN_NAME_DELIMITER, end - delimiterLength);
            final int labelStart = (delimiterAt < 0) ? 0 : delimiterAt + delimiterLength;
            reversed.append(hostname, labelStart, end);
            if (delimiterAt < 0) {
                break;
            }
            reversed.append(DOMAIN_NAME_DELIMITER);
            end = delimiterAt;
        }
        return reversed.toString();
    }
}