| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Keying |
|
| 7.0;7 |
| 1 | package org.archive.io.hbase; | |
| 2 | ||
| 3 | /** | |
| 4 | * Copyright 2010 The Apache Software Foundation | |
| 5 | * | |
| 6 | * Licensed to the Apache Software Foundation (ASF) under one | |
| 7 | * or more contributor license agreements. See the NOTICE file | |
| 8 | * distributed with this work for additional information | |
| 9 | * regarding copyright ownership. The ASF licenses this file | |
| 10 | * to you under the Apache License, Version 2.0 (the | |
| 11 | * "License"); you may not use this file except in compliance | |
| 12 | * with the License. You may obtain a copy of the License at | |
| 13 | * | |
| 14 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 15 | * | |
| 16 | * Unless required by applicable law or agreed to in writing, software | |
| 17 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 18 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 19 | * See the License for the specific language governing permissions and | |
| 20 | * limitations under the License. | |
| 21 | */ | |
| 22 | ||
| 23 | import java.util.StringTokenizer; | |
| 24 | import java.util.regex.Matcher; | |
| 25 | import java.util.regex.Pattern; | |
| 26 | ||
/**
 * Utility creating hbase friendly keys. Use fabricating row names or column
 * qualifiers.
 * <p>
 * TODO: Add createSchemeless key, a key that doesn't care if scheme is http or
 * https.
 *
 * @see Bytes#split(byte[], byte[], int)
 */
public class Keying {
    /** Artificial scheme prefix that can be passed as the <code>scheme</code> argument. */
    public static final String REFERER_URL_SCHEME = "r:";

    /**
     * Splits a hierarchical URI into three groups: (1) the scheme, the
     * <code>://</code> separator and any <code>userinfo@</code>, (2) the host,
     * (3) everything after the host (port, path, query, fragment).
     */
    private static final Pattern URI_RE_PARSER = Pattern.compile("^([^:/?#]+://(?:[^/?#@]+@)?)([^:/?#]+)(.*)$");

    /** Separator between the labels of a host name. */
    public static final String DOMAIN_NAME_DELIMITER = ".";

    /**
     * Makes a key out of passed URI for use as row name or column qualifier.
     *
     * This method runs transforms on the passed URI so it sits better as a key
     * (or portion-of-a-key) in hbase. The <code>host</code> portion of the URI
     * authority is reversed so subdomains sort under their parent domain. The
     * returned String is prefixed with the passed artificial scheme (for
     * example <code>r:</code>) to prevent the result being considered an URI
     * of the original scheme. Here is an example of the transform: with scheme
     * <code>r:</code> the url
     * <code>http://lucene.apache.org/index.html?query=something#middle</code>
     * is returned as
     * <code>r:http://org.apache.lucene/index.html?query=something#middle</code>
     * The transforms are reversible. No transform is done, and no scheme is
     * prepended, if passed URI is not hierarchical.
     *
     * <p>
     * If authority <code>userinfo</code> is present, will mess up the sort
     * (until we do more work).
     * </p>
     *
     * @param u
     *            URL to transform. <code>null</code> is returned unchanged.
     * @param scheme
     *            Prefix to prepend to the key, e.g.
     *            {@link #REFERER_URL_SCHEME}. <code>null</code> is treated as
     *            "no prefix".
     * @return The passed scheme followed by the URI with the host portion of
     *         its authority reversed, or <code>u</code> itself if it could not
     *         be parsed.
     * @throws IllegalArgumentException
     *             if <code>scheme</code> is non-empty and <code>u</code>
     *             already starts with it.
     * @see #keyToUri(String, String)
     * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC2396</a>
     */
    public static String createKey(final String u, String scheme) {
        if (u == null) {
            // Mirror keyToUri: a null input is handed back untouched
            // regardless of the scheme argument.
            return null;
        }
        // A null scheme must not be concatenated as the literal text "null".
        final String prefix = (scheme == null) ? "" : scheme;
        if (prefix.length() > 0 && u.startsWith(prefix)) {
            throw new IllegalArgumentException("Key already starts with a scheme: " + scheme);
        }
        Matcher m = getURIMatcher(u);
        if (m == null || !m.matches()) {
            // If no match, return original String.
            return u;
        }
        return prefix + m.group(1) + reverseHostname(m.group(2)) + m.group(3);
    }

    /**
     * Reverse the {@link #createKey(String, String)} transform.
     *
     * @param s
     *            Key made by {@link #createKey(String, String)}.
     * @param scheme
     *            Prefix that was passed to
     *            {@link #createKey(String, String)}; it is matched against the
     *            start of <code>s</code> ignoring case.
     * @return 'Restored' URI made by stripping the scheme and reversing the
     *         host again, or <code>s</code> unchanged if either argument is
     *         <code>null</code>, if <code>s</code> does not start with
     *         <code>scheme</code>, or if the remainder cannot be parsed.
     */
    public static String keyToUri(final String s, final String scheme) {
        if (scheme == null || s == null) {
            return s;
        }
        // Locale-independent, case-insensitive prefix test. Lower-casing both
        // strings in the default locale can change characters (and even
        // lengths), which would break the substring below.
        if (!s.regionMatches(true, 0, scheme, 0, scheme.length())) {
            return s;
        }
        // here we have a matching scheme
        Matcher uriMatchObject = getURIMatcher(s.substring(scheme.length()));
        if (uriMatchObject == null || !uriMatchObject.matches()) {
            // If no match, return original String.
            return s;
        }
        // only return a modified key if we have a matching scheme and both
        // arguments are not null
        return uriMatchObject.group(1) + reverseHostname(uriMatchObject.group(2)) + uriMatchObject.group(3);
    }

    /**
     * Builds a matcher of {@link #URI_RE_PARSER} over the passed text.
     *
     * @param uriText
     *            Text to parse.
     * @return The matcher (not yet matched), or <code>null</code> if the text
     *         is <code>null</code> or empty.
     */
    private static Matcher getURIMatcher(final String uriText) {
        if (uriText == null || uriText.length() <= 0) {
            return null;
        }
        return URI_RE_PARSER.matcher(uriText);
    }

    /**
     * Reverses the order of the {@link #DOMAIN_NAME_DELIMITER}-separated
     * labels of a host name, e.g. <code>lucene.apache.org</code> becomes
     * <code>org.apache.lucene</code>.
     * <p>
     * Empty labels are preserved, so a host with a trailing dot such as
     * <code>example.com.</code> becomes <code>.com.example</code>. The result
     * always has the same length as the input and applying the method twice
     * yields the original host.
     * </p>
     *
     * @param hostname
     *            Host name to reverse.
     * @return The host name with its labels in reverse order, or the empty
     *         string if <code>hostname</code> is <code>null</code>.
     */
    public static String reverseHostname(final String hostname) {
        if (hostname == null) {
            return "";
        }
        // A negative limit keeps leading, trailing and consecutive empty
        // labels. Dropping them would make the transform irreversible.
        final String[] labels = hostname.split(Pattern.quote(DOMAIN_NAME_DELIMITER), -1);
        final StringBuilder sb = new StringBuilder(hostname.length());
        for (int i = labels.length - 1; i >= 0; i--) {
            sb.append(labels[i]);
            if (i > 0) {
                sb.append(DOMAIN_NAME_DELIMITER);
            }
        }
        return sb.toString();
    }
}