RAT (Release Audit Tool) results

The following document contains the results of RAT (Release Audit Tool).

*****************************************************
Summary
-------
Notes: 3
Binaries: 0
Archives: 0
Standards: 11

Apache Licensed: 0
Generated Documents: 0

JavaDocs are generated and so license header is optional
Generated files do not require license headers

11 Unknown Licenses

*******************************

Archives (+ indicates readable, $ unreadable): 

 
*****************************************************
  Files with AL headers will be marked L
  Binary files (which do not require AL headers) will be marked B
  Compressed archives will be marked A
  Notices, licenses etc will be marked N
  N     CHANGELOG.txt
  N     LICENSE.txt
 !????? pom.xml
  N     README.txt
 !????? src/conf/modules/Processor.options
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
 !????? src/main/java/com/powerset/heritrix/writer/package.html
 !????? src/main/resources/checkstyle.xml
 !????? src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
 !????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
 !????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
 !????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
 
 *****************************************************
 Printing headers for files without AL header...
 
 
 =======================================================================
 ==pom.xml
 =======================================================================
 <project xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  	
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.powerset.heritrix</groupId>
  <artifactId>hbase-writer</artifactId>
  <packaging>jar</packaging>
  <version>0.20.3-SNAPSHOT</version>
  <name>HBase Writer</name>
  <description>A library for writing Heritrix2 output directly as records to an HBase table.</description>
  <url>http://code.google.com/p/hbase-writer</url>
  
  <inceptionYear>2007</inceptionYear>
  
  <issueManagement>
    <system>GoogleCode Issues</system>
    <url>http://code.google.com/p/hbase-writer/issues/list</url>
  </issueManagement>
  
  <mailingLists>
    <mailingList>
      <name>User List</name>
      <subscribe>hbase-user-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-user-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-user@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-user/</archive>
    </mailingList>
    
    <mailingList>
      <name>Dev List</name>
      <subscribe>hbase-dev-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-dev-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-dev@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-dev/</archive>
    </mailingList>
    
    <mailingList>
      <name>Commits List</name>
      <subscribe>hbase-commits-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-commits-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-commits@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-commits/</archive>
    </mailingList>
  </mailingLists>

  <licenses>
    <license>
      <name>LGPL</name>
      <url>http://www.gnu.org/copyleft/lesser.html</url>

 =======================================================================
 ==src/conf/modules/Processor.options
 =======================================================================
 # Available processors.
# Each processor class should be listed with full package info
# followed by a '|' and a descriptive name (containing only [a-z,A-Z])
# Lines beginning with # and empty lines are ignored
com.powerset.heritrix.writer.HBaseWriterProcessor|HBaseArchiver

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
 =======================================================================
 /**
 * HBaseWriter
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.log4j.Logger;
import org.archive.io.ArchiveFileConstants;
import org.archive.io.RecordingInputStream;
import org.archive.io.RecordingOutputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ProcessorURI;

/**
 * Write crawled content as records to an HBase table. 
 * Puts content into the 'content:raw_data' column and all else into the
 * 'curi:' column family. Makes a row key of an url transformation. Creates
 * table if it does not exist.

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
 =======================================================================
 /** HBaseWriterPool
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.pool.BasePoolableObjectFactory;
import org.archive.io.DefaultWriterPoolSettings;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;

/**
 * A pool of HBaseWriters.
 */
public class HBaseWriterPool extends WriterPool {
	
	/**
	 * Constructor.
	 * 
	 * @param zkQuorum the list of zookeeper quorum servers that serve HBase, comma separated.  
	 * 			i.e.:  zkHost1,zkHost2,zkHost3
	 * @param table the table name in HBase
	 * @param poolMaximumActive the pool maximum active
	 * @param poolMaximumWait the pool maximum wait
	 */
	public HBaseWriterPool(final String zkQuorum, final int zkClientPort, final String table, final int poolMaximumActive, final int poolMaximumWait) {
		// Below is hard to follow. It is an invocation of this class's super
		// constructor passing a serial, an instance of BasePoolable.. that
		// is defined in line, followed by settings, max and wait.
		super(

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
 =======================================================================
 /**
 * HBaseWriterProcessor
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.log4j.Logger;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ModuleAttributeConstants;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.ProcessorURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.net.ServerCacheUtil;
import org.archive.state.Expert;
import org.archive.state.Immutable;
import org.archive.state.Initializable;
import org.archive.state.Key;
import org.archive.state.KeyManager;

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/package.html
 =======================================================================
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head />
<body bgcolor="white">
Provides <a href="http://hbase.org">HBase</a> writer for
<a href="http://crawler.archive.org">heritrix</a>.

<h2>Requirements</h2>
TODO
</body>
</html>

 =======================================================================
 ==src/main/resources/checkstyle.xml
 =======================================================================
 <?xml version="1.0"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.2//EN" "http://www.puppycrawl.com/dtds/configuration_1_2.dtd">

<!--

  Checkstyle configuration that checks the sun coding conventions from:

    - the Java Language Specification at
      http://java.sun.com/docs/books/jls/second_edition/html/index.html

    - the Sun Code Conventions at http://java.sun.com/docs/codeconv/

    - the Javadoc guidelines at
      http://java.sun.com/j2se/javadoc/writingdoccomments/index.html

    - the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html

    - some best practices

  Checkstyle is very configurable. Be sure to read the documentation at
  http://checkstyle.sf.net (or in your downloaded distribution).

  Most Checks are configurable, be sure to consult the documentation.

  To completely disable a check, just comment it out or delete it from the file.

  Finally, it is worth reading the documentation.

-->

<module name="Checker">
  <module name="TreeWalker">

    <!-- Checks for imports                              -->
    <!-- See http://checkstyle.sf.net/config_import.html -->
    <module name="AvoidStarImport"/>
    <module name="IllegalImport"/> <!-- defaults to sun.* packages -->
    <module name="RedundantImport"/>
    <module name="UnusedImports"/>


    <!-- Checks for Size Violations.                    -->
    <!-- See http://checkstyle.sf.net/config_sizes.html -->



    <!-- Modifier Checks                                    -->
    <!-- See http://checkstyle.sf.net/config_modifiers.html -->

    <!-- Checks for blocks. You know, those {}'s         -->

 =======================================================================
 ==src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
 =======================================================================
 description:
HBaseWriter processor.

server-cache-description:
The server cache used to resolve IP addresses.

pool-max-active-description:
Maximum active files in pool. This setting cannot be varied over the life
of a crawl.

pool-max-wait-description:
Maximum time to wait on pool element (milliseconds). This setting cannot
be varied over the life of a crawl.

total-bytes-to-write-description:
Total file bytes to write to disk. Once the size of all files on disk has
exceeded this limit, this processor will stop the crawler. A value of
zero means no upper limit.

table-description:
Name of the HBase table to write crawl to

zkquorum-description:
quorum of zk hosts that describe where the hbase master is.

zkclientport-description:
The zookeeper quorum client port that clients should connect to to get HBase information.

write-only-new-records-description:
If set to true, only write new rowkey records (urls) to hbase.  
Default is set to false, which writes all urls that are crawled.

process-only-new-records-description:
If set to true, only process (fetch and parse) new rowkey records (urls) to crawl.  
Default is set to false, which fetches and parses all urls, even if they exist in hbase.

 =======================================================================
 ==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
 =======================================================================
 package com.powerset.heritrix.writer.test;

import java.io.IOException;

import org.testng.Assert;
import org.testng.annotations.Test;

import com.powerset.heritrix.writer.HBaseWriter;

// TODO: Auto-generated Javadoc
/**
 * The Class TestHBaseWriter.
 */
public class TestHBaseWriter {
	
	/** The master. */
	String zkQuorum = "localhost";
	
	int zkClientPort = 2181;
	
	/** The table. */
	String table = "test";
	
	/** The pool maximum active. */
	int poolMaximumActive = 10;
	
	/** The pool maximum wait. */
	int poolMaximumWait = 20;

	/** The hw. */
	HBaseWriter hw;

	/**
	 * Test that bad table values cannot be used when creating an instance of
	 * HBaseWriter.
	 * 
	 * @throws IOException Signals that an I/O exception has occurred.
	 */
	@Test()
	public void testCreateHBaseWriter() throws IOException {
		// Test
		try {
			hw = new HBaseWriter(zkQuorum,zkClientPort,null);
			Assert.assertNull(hw);
		} catch (IllegalArgumentException e) {
			Assert.assertNotNull(e);
		}

		try {
			hw = new HBaseWriter(zkQuorum, zkClientPort, "");

 =======================================================================
 ==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
 =======================================================================
 package com.powerset.heritrix.writer.test;

import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import com.powerset.heritrix.writer.HBaseWriterPool;

// TODO: Auto-generated Javadoc
/**
 * The Class TestHBaseWriterPool.
 */
public class TestHBaseWriterPool {
	
	/** The master. */
	String zkQuorum = "locahost";
	
	int clientPort = 2181;
	
	/** The table. */
	String table = "test";
	
	/** The pool maximum active. */
	int poolMaximumActive = 10;
	
	/** The pool maximum wait. */
	int poolMaximumWait = 20;

	/** The hwp. */
	HBaseWriterPool hwp;

	/**
	 * Instantiates the pool under test from this class's fixture fields
	 * (quorum, client port, table name, and the two pool limits).
	 */
	@BeforeClass
	public void createHBaseWriterPool() {
		this.hwp = new HBaseWriterPool(
				this.zkQuorum,
				this.clientPort,
				this.table,
				this.poolMaximumActive,
				this.poolMaximumWait);
	}

	/**
	 * Test h base writer pool integrity.
	 */
	@Test()
	public void testHBaseWriterPoolIntegrity() {
		Assert.assertNotNull(hwp);
		Assert.assertEquals(hwp.getNumActive(), 0);
		Assert.assertEquals(hwp.getNumIdle(), 0);
		Assert.assertEquals(hwp.getSerialNo().intValue(), 0);
		Assert.assertFalse(hwp.getSettings().isCompressed());
		Assert.assertNull(hwp.getSettings().getPrefix());

 =======================================================================
 ==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
 =======================================================================
 package com.powerset.heritrix.writer.test;

import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import com.powerset.heritrix.writer.HBaseWriterProcessor;

// TODO: Auto-generated Javadoc
/**
 * Basic construction checks for {@link HBaseWriterProcessor}.
 *
 * <p>TODO: mock objects should be used here to test the api integrity.
 *
 * @author rsmith
 */
public class TestHBaseWriterProcessor {

	/** Processor instance under test, assigned in {@link #createHBaseWriterProcessor()}. */
	HBaseWriterProcessor hwproc;

	/**
	 * Builds the processor under test using its no-argument constructor.
	 */
	@BeforeClass
	public void createHBaseWriterProcessor() {
		this.hwproc = new HBaseWriterProcessor();
	}

	/**
	 * Verifies that the processor was created and that its URI count is zero.
	 */
	@Test
	public void testHBaseWriterProcessorIntegrity() {
		final HBaseWriterProcessor processor = this.hwproc;
		Assert.assertNotNull(processor);
		Assert.assertEquals(processor.getURICount(), 0);
	}
}