RAT (Release Audit Tool) results

The following document contains the results of RAT (Release Audit Tool).

*****************************************************
Summary
-------
Notes: 3
Binaries: 0
Archives: 0
Standards: 8

Apache Licensed: 0
Generated Documents: 0

JavaDocs are generated, so a license header is optional
Generated files do not require license headers

8 Unknown Licenses

*******************************

Archives (+ indicates readable, $ unreadable): 

 
*****************************************************
  Files with AL headers will be marked L
  Binary files (which do not require AL headers) will be marked B
  Compressed archives will be marked A
  Notices, licenses etc will be marked N
 !????? pom.xml
  N     CHANGELOG.txt
 !????? src/conf/modules/Processor.options
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
 !????? src/main/java/com/powerset/heritrix/writer/package.html
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
 !????? src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
 !????? src/main/resources/checkstyle.xml
  N     README.txt
  N     LICENSE.txt
 
 *****************************************************
 Printing headers for files without AL header...
 
 
 =======================================================================
 ==pom.xml
 =======================================================================
 <project xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  	
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.powerset.heritrix</groupId>
  <artifactId>hbase-writer</artifactId>
  <packaging>jar</packaging>
  <version>0.18.1-SNAPSHOT</version>
  <name>HBase Writer</name>
  <description>A library for writing Heritrix output directly to HBase tables.</description>
  <url>http://code.google.com/p/hbase-writer</url>
  
  <inceptionYear>2007</inceptionYear>
  
  <issueManagement>
    <system>GoogleCode Issues</system>
    <url>http://code.google.com/p/hbase-writer/issues/list</url>
  </issueManagement>
  
  <mailingLists>
    <mailingList>
      <name>User List</name>
      <subscribe>hbase-user-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-user-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-user@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-user/</archive>
    </mailingList>
    
    <mailingList>
      <name>Dev List</name>
      <subscribe>hbase-dev-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-dev-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-dev@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-dev/</archive>
    </mailingList>
    
    <mailingList>
      <name>Commits List</name>
      <subscribe>hbase-commits-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-commits-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-commits@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-commits/</archive>
    </mailingList>
  </mailingLists>

  <licenses>
    <license>
      <name>LGPL</name>
      <url>http://www.gnu.org/copyleft/lesser.html</url>

 =======================================================================
 ==src/conf/modules/Processor.options
 =======================================================================
 # Available processors.
# Each processor class should be listed with full package info
# followed by a '|' and a descriptive name (containing only [a-z,A-Z])
# Lines beginning with # and empty lines are ignored
com.powerset.heritrix.writer.HBaseWriterProcessor|HBaseArchiver

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
 =======================================================================
 /* HBaseWriterPool
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * Copyright (C) 2007 stack
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.pool.BasePoolableObjectFactory;
import org.archive.io.DefaultWriterPoolSettings;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;


/**
 * A pool of HBaseWriters.
 * @author stack
 */
public class HBaseWriterPool extends WriterPool {
  /**
   * Constructor
   * @param poolMaximumActive
   * @param poolMaximumWait
   */
  public HBaseWriterPool(final String master, final String table,
      final int poolMaximumActive, final int poolMaximumWait) {
    // Below is hard to follow.  It is an invocation of this class's super
    // constructor, passing a serial, an instance of BasePoolable.. that
    // is defined inline, followed by settings, max and wait.
    super(new AtomicInteger(), new BasePoolableObjectFactory() {

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
 =======================================================================
 /*
 * HBaseWriterProcessor
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * Copyright (C) 2007 stack
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.hadoop.hbase.HConstants;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ModuleAttributeConstants;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.ProcessorURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.net.ServerCacheUtil;
import org.archive.state.Expert;
import org.archive.state.Immutable;
import org.archive.state.Initializable;
import org.archive.state.Key;

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/package.html
 =======================================================================
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head />
<body bgcolor="white">
Provides <a href="http://hbase.org">HBase</a> writer for
<a href="http://crawler.archive.org">heritrix</a>.

<h2>Requirements</h2>
TODO
</body>
</html>

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
 =======================================================================
 /**
 * HBaseWriter
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * Copyright (C) 2007 stack
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Keying;
import org.archive.io.ArchiveFileConstants;
import org.archive.io.RecordingInputStream;
import org.archive.io.RecordingOutputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ProcessorURI;

/**
 * Write to HBase.

 =======================================================================
 ==src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
 =======================================================================
 description:
HBaseWriter processor.

server-cache-description:
The server cache used to resolve IP addresses.

pool-max-active-description:
Maximum active files in pool. This setting cannot be varied over the life
of a crawl.


pool-max-wait-description:
Maximum time to wait on pool element (milliseconds). This setting cannot
be varied over the life of a crawl.

total-bytes-to-write-description:
Total file bytes to write to disk. Once the size of all files on disk has
exceeded this limit, this processor will stop the crawler. A value of
zero means no upper limit.

table-description:
Name of the HBase table to write crawl to

master-description:
Master host and port: e.g. localhost:60010

 =======================================================================
 ==src/main/resources/checkstyle.xml
 =======================================================================
 <?xml version="1.0"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.2//EN" "http://www.puppycrawl.com/dtds/configuration_1_2.dtd">

<!--

  Checkstyle configuration that checks the sun coding conventions from:

    - the Java Language Specification at
      http://java.sun.com/docs/books/jls/second_edition/html/index.html

    - the Sun Code Conventions at http://java.sun.com/docs/codeconv/

    - the Javadoc guidelines at
      http://java.sun.com/j2se/javadoc/writingdoccomments/index.html

    - the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html

    - some best practices

  Checkstyle is very configurable. Be sure to read the documentation at
  http://checkstyle.sf.net (or in your downloaded distribution).

  Most Checks are configurable, be sure to consult the documentation.

  To completely disable a check, just comment it out or delete it from the file.

  Finally, it is worth reading the documentation.

-->

<module name="Checker">
  <module name="TreeWalker">

    <!-- Checks for imports                              -->
    <!-- See http://checkstyle.sf.net/config_import.html -->
    <module name="AvoidStarImport"/>
    <module name="IllegalImport"/> <!-- defaults to sun.* packages -->
    <module name="RedundantImport"/>
    <module name="UnusedImports"/>


    <!-- Checks for Size Violations.                    -->
    <!-- See http://checkstyle.sf.net/config_sizes.html -->



    <!-- Modifier Checks                                    -->
    <!-- See http://checkstyle.sf.net/config_modifiers.html -->

    <!-- Checks for blocks. You know, those {}'s         -->