View Javadoc
1   package org.archive.io.hbase;
2   
3   /**
4    * Copyright 2010 The Apache Software Foundation
5    *
6    * Licensed to the Apache Software Foundation (ASF) under one
7    * or more contributor license agreements.  See the NOTICE file
8    * distributed with this work for additional information
9    * regarding copyright ownership.  The ASF licenses this file
10   * to you under the Apache License, Version 2.0 (the
11   * "License"); you may not use this file except in compliance
12   * with the License.  You may obtain a copy of the License at
13   *
14   *     http://www.apache.org/licenses/LICENSE-2.0
15   *
16   * Unless required by applicable law or agreed to in writing, software
17   * distributed under the License is distributed on an "AS IS" BASIS,
18   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19   * See the License for the specific language governing permissions and
20   * limitations under the License.
21   */
22  
23  import java.util.StringTokenizer;
24  import java.util.regex.Matcher;
25  import java.util.regex.Pattern;
26  
/**
 * Utility creating hbase friendly keys. Use when fabricating row names or
 * column qualifiers.
 * <p>
 * TODO: Add createSchemeless key, a key that doesn't care if scheme is http or
 * https.
 * 
 * @see Bytes#split(byte[], byte[], int)
 */
public class Keying {
	/** Artificial scheme prefix conventionally put in front of generated keys. */
	public static final String REFERER_URL_SCHEME = "r:";

	/**
	 * Splits a hierarchical URI into three groups: (1) scheme, "://" and
	 * optional userinfo, (2) the host, (3) everything after the host (port,
	 * path, query, fragment).
	 */
	private static final Pattern URI_RE_PARSER = Pattern.compile("^([^:/?#]+://(?:[^/?#@]+@)?)([^:/?#]+)(.*)$");

	/** Separator between the labels of a host name. */
	public static final String DOMAIN_NAME_DELIMITER = ".";

	/**
	 * Makes a key out of passed URI for use as row name or column qualifier.
	 * 
	 * This method runs transforms on the passed URI so it sits better as a key
	 * (or portion-of-a-key) in hbase. The <code>host</code> portion of the URI
	 * authority is reversed so subdomains sort under their parent domain, and
	 * the passed <code>scheme</code> is prepended to prevent the result being
	 * considered an URI of the original scheme. Here is an example of the
	 * transform with scheme <code>r:</code>: The url
	 * <code>http://lucene.apache.org/index.html?query=something#middle</code>
	 * is returned as
	 * <code>r:http://org.apache.lucene/index.html?query=something#middle</code>
	 * The transforms are reversible. No transform is done (and no scheme is
	 * prepended) if passed URI is not hierarchical.
	 * 
	 * <p>
	 * If authority <code>userinfo</code> is present, will mess up the sort
	 * (until we do more work).
	 * </p>
	 * 
	 * @param u
	 *            URL to transform; <code>null</code> is returned unchanged.
	 * @param scheme
	 *            prefix to put in front of the key, e.g.
	 *            {@link #REFERER_URL_SCHEME}. <code>null</code> is treated as
	 *            the empty string so it never leaks into the key as the text
	 *            "null".
	 * @return The passed scheme followed by the URI with the host portion of
	 *         its authority reversed, or <code>u</code> itself if it is
	 *         <code>null</code>, empty or not hierarchical.
	 * @throws IllegalArgumentException
	 *             if <code>scheme</code> is non-empty and <code>u</code>
	 *             already starts with it.
	 * @see #keyToUri(String, String)
	 * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC2396</a>
	 */
	public static String createKey(final String u, String scheme) {
		if (u == null) {
			// Same answer regardless of scheme; previously this was an NPE
			// only when a non-empty scheme was passed.
			return null;
		}
		final String prefix = (scheme == null) ? "" : scheme;
		if (prefix.length() > 0 && u.startsWith(prefix)) {
			throw new IllegalArgumentException("Key already starts with a scheme: " + prefix);
		}
		final Matcher m = getURIMatcher(u);
		if (m == null || !m.matches()) {
			// If no match, return original String.
			return u;
		}
		return prefix + m.group(1) + reverseHostname(m.group(2)) + m.group(3);
	}

	/**
	 * Reverse the {@link #createKey(String, String)} transform.
	 * 
	 * @param s
	 *            key made by {@link #createKey(String, String)}.
	 * @param scheme
	 *            the prefix that was passed to
	 *            {@link #createKey(String, String)}; matched against the start
	 *            of <code>s</code> ignoring case.
	 * @return 'Restored' URI made by stripping <code>scheme</code> and
	 *         reversing the host again. <code>s</code> is returned unchanged
	 *         if either argument is <code>null</code>, if <code>s</code> does
	 *         not start with <code>scheme</code>, or if the remainder is not a
	 *         hierarchical URI.
	 */
	public static String keyToUri(final String s, final String scheme) {
		if (scheme == null || s == null) {
			return s;
		}
		// regionMatches(ignoreCase) compares char by char and does not depend
		// on the default locale, unlike toLowerCase() (e.g. Turkish dotless i),
		// and it cannot change the length the substring below relies on.
		if (!s.regionMatches(true, 0, scheme, 0, scheme.length())) {
			return s;
		}
		// here we have a matching scheme
		final Matcher m = getURIMatcher(s.substring(scheme.length()));
		if (m == null || !m.matches()) {
			// If no match, return original String.
			return s;
		}
		// only return a modified key if we have a matching scheme and both
		// arguments are not null
		return m.group(1) + reverseHostname(m.group(2)) + m.group(3);
	}

	/**
	 * @param uriText
	 *            text to parse.
	 * @return a matcher of {@link #URI_RE_PARSER} over <code>uriText</code>,
	 *         or <code>null</code> if the text is <code>null</code> or empty.
	 */
	private static Matcher getURIMatcher(final String uriText) {
		if (uriText == null || uriText.length() <= 0) {
			return null;
		}
		return URI_RE_PARSER.matcher(uriText);
	}

	/**
	 * Reverses the order of the {@link #DOMAIN_NAME_DELIMITER}-separated labels
	 * of a host name, e.g. <code>lucene.apache.org</code> becomes
	 * <code>org.apache.lucene</code>.
	 * <p>
	 * Empty labels are preserved, so a fully-qualified name with a trailing dot
	 * (<code>example.com.</code>) becomes <code>.com.example</code>. The output
	 * always has the same length as the input and applying the method twice
	 * yields the original string.
	 * </p>
	 * 
	 * @param hostname
	 *            host name to reverse.
	 * @return the host name with its labels in reverse order, or the empty
	 *         string if <code>hostname</code> is <code>null</code>.
	 */
	public static String reverseHostname(final String hostname) {
		if (hostname == null) {
			return "";
		}
		final int delimiterLength = DOMAIN_NAME_DELIMITER.length();
		final StringBuilder reversed = new StringBuilder(hostname.length());
		// Walk the labels from right to left, appending each one; 'end' is the
		// exclusive end index of the label currently being looked for.
		int end = hostname.length();
		int dot = hostname.lastIndexOf(DOMAIN_NAME_DELIMITER, end - delimiterLength);
		while (dot >= 0) {
			reversed.append(hostname, dot + delimiterLength, end).append(DOMAIN_NAME_DELIMITER);
			end = dot;
			dot = hostname.lastIndexOf(DOMAIN_NAME_DELIMITER, end - delimiterLength);
		}
		// Left-most label (possibly empty) goes last.
		reversed.append(hostname, 0, end);
		return reversed.toString();
	}
}