RAT (Release Audit Tool) results

The following document contains the results of RAT (Release Audit Tool).

*****************************************************
Summary
-------
Notes: 3
Binaries: 0
Archives: 0
Standards: 12

Apache Licensed: 0
Generated Documents: 0

JavaDocs are generated, so a license header is optional
Generated files do not require license headers

12 Unknown Licenses

*******************************

Archives (+ indicates readable, $ unreadable): 

 
*****************************************************
  Files with AL headers will be marked L
  Binary files (which do not require AL headers) will be marked B
  Compressed archives will be marked A
  Notices, licenses etc will be marked N
 !????? pom.xml
  N     CHANGELOG.txt
 !????? src/conf/modules/Processor.options
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
 !????? src/main/java/com/powerset/heritrix/writer/package.html
 !????? src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
 !????? src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
 !????? src/main/resources/checkstyle.xml
 !????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
 !????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
 !????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
 !????? .checkstyle
  N     README.txt
  N     LICENSE.txt
 
 *****************************************************
 Printing headers for files without AL header...
 
 
 =======================================================================
 ==pom.xml
 =======================================================================
 <project xmlns="http://maven.apache.org/POM/4.0.0" 
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  	
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.powerset.heritrix</groupId>
  <artifactId>hbase-writer</artifactId>
  <packaging>jar</packaging>
  <version>0.19.2-SNAPSHOT</version>
  <name>HBase Writer</name>
  <description>A library for writing Heritrix output directly to HBase tables.</description>
  <url>http://code.google.com/p/hbase-writer</url>
  
  <inceptionYear>2007</inceptionYear>
  
  <issueManagement>
    <system>GoogleCode Issues</system>
    <url>http://code.google.com/p/hbase-writer/issues/list</url>
  </issueManagement>
  
  <mailingLists>
    <mailingList>
      <name>User List</name>
      <subscribe>hbase-user-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-user-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-user@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-user/</archive>
    </mailingList>
    
    <mailingList>
      <name>Dev List</name>
      <subscribe>hbase-dev-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-dev-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-dev@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-dev/</archive>
    </mailingList>
    
    <mailingList>
      <name>Commits List</name>
      <subscribe>hbase-commits-subscribe@hadoop.apache.org</subscribe>
      <unsubscribe>hbase-commits-unsubscribe@hadoop.apache.org</unsubscribe>
      <post>hbase-commits@hadoop.apache.org</post>
      <archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-commits/</archive>
    </mailingList>
  </mailingLists>

  <licenses>
    <license>
      <name>LGPL</name>
      <url>http://www.gnu.org/copyleft/lesser.html</url>

 =======================================================================
 ==src/conf/modules/Processor.options
 =======================================================================
 # Available processors.
# Each processor class should be listed with full package info
# followed by a '|' and a descriptive name (containing only [a-z,A-Z])
# Lines beginning with # and empty lines are ignored
com.powerset.heritrix.writer.HBaseWriterProcessor|HBaseArchiver

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
 =======================================================================
 /** HBaseWriterPool
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.pool.BasePoolableObjectFactory;
import org.archive.io.DefaultWriterPoolSettings;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;

// TODO: Auto-generated Javadoc
/**
 * A pool of HBaseWriters.
 */
public class HBaseWriterPool extends WriterPool {
	
	/**
	 * Constructor.
	 * 
	 * @param poolMaximumActive the pool maximum active
	 * @param poolMaximumWait the pool maximum wait
	 * @param master the master
	 * @param table the table
	 */
	public HBaseWriterPool(final String master, final String table, final int poolMaximumActive, final int poolMaximumWait) {
		// The call below is hard to follow: it invokes this class's super
		// constructor, passing a serial, an instance of BasePoolable.. that
		// is defined inline, followed by settings, max and wait.
		super(new AtomicInteger(), new BasePoolableObjectFactory() {

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
 =======================================================================
 /**
 * HBaseWriterProcessor
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;

import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.log4j.Logger;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ModuleAttributeConstants;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.ProcessorURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.net.ServerCacheUtil;
import org.archive.state.Expert;
import org.archive.state.Immutable;
import org.archive.state.Initializable;
import org.archive.state.Key;
import org.archive.state.KeyManager;

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/package.html
 =======================================================================
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head />
<body bgcolor="white">
Provides <a href="http://hbase.org">HBase</a> writer for
<a href="http://crawler.archive.org">heritrix</a>.

<h2>Requirements</h2>
TODO
</body>
</html>

 =======================================================================
 ==src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
 =======================================================================
 /**
 * HBaseWriter
 *
 * $Id$
 *
 * Created on June 23rd, 2007
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.powerset.heritrix.writer;

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.log4j.Logger;
import org.archive.io.ArchiveFileConstants;
import org.archive.io.RecordingInputStream;
import org.archive.io.RecordingOutputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ProcessorURI;

// TODO: Auto-generated Javadoc
/**
 * Write to HBase. Puts content into the 'content:' column and all else into the
 * 'curi:' column family. Makes a row key of an url transformation. Creates
 * table if it does not exist.

 =======================================================================
 ==src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
 =======================================================================
 description:
HBaseWriter processor.

server-cache-description:
The server cache used to resolve IP addresses.

pool-max-active-description:
Maximum active files in pool. This setting cannot be varied over the life
of a crawl.


pool-max-wait-description:
Maximum time to wait on pool element (milliseconds). This setting cannot
be varied over the life of a crawl.

total-bytes-to-write-description:
Total file bytes to write to disk. Once the size of all files on disk has
exceeded this limit, this processor will stop the crawler. A value of
zero means no upper limit.

table-description:
Name of the HBase table to write crawl to

master-description:
Master host and port: e.g. localhost:60010

 =======================================================================
 ==src/main/resources/checkstyle.xml
 =======================================================================
 <?xml version="1.0"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.2//EN" "http://www.puppycrawl.com/dtds/configuration_1_2.dtd">

<!--

  Checkstyle configuration that checks the sun coding conventions from:

    - the Java Language Specification at
      http://java.sun.com/docs/books/jls/second_edition/html/index.html

    - the Sun Code Conventions at http://java.sun.com/docs/codeconv/

    - the Javadoc guidelines at
      http://java.sun.com/j2se/javadoc/writingdoccomments/index.html

    - the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html

    - some best practices

  Checkstyle is very configurable. Be sure to read the documentation at
  http://checkstyle.sf.net (or in your downloaded distribution).

  Most Checks are configurable, be sure to consult the documentation.

  To completely disable a check, just comment it out or delete it from the file.

  Finally, it is worth reading the documentation.

-->

<module name="Checker">
  <module name="TreeWalker">

    <!-- Checks for imports                              -->
    <!-- See http://checkstyle.sf.net/config_import.html -->
    <module name="AvoidStarImport"/>
    <module name="IllegalImport"/> <!-- defaults to sun.* packages -->
    <module name="RedundantImport"/>
    <module name="UnusedImports"/>


    <!-- Checks for Size Violations.                    -->
    <!-- See http://checkstyle.sf.net/config_sizes.html -->



    <!-- Modifier Checks                                    -->
    <!-- See http://checkstyle.sf.net/config_modifiers.html -->

    <!-- Checks for blocks. You know, those {}'s         -->

 =======================================================================
 ==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
 =======================================================================
 package com.powerset.heritrix.writer.test;

import java.io.IOException;

import org.testng.Assert;
import org.testng.annotations.Test;

import com.powerset.heritrix.writer.HBaseWriter;

// TODO: Auto-generated Javadoc
/**
 * The Class TestHBaseWriter.
 */
public class TestHBaseWriter {
	
	/** The master. */
	String master = "localhost:60000";
	
	/** The table. */
	String table = "test";
	
	/** The pool maximum active. */
	int poolMaximumActive = 10;
	
	/** The pool maximum wait. */
	int poolMaximumWait = 20;

	/** The hw. */
	HBaseWriter hw;

	/**
	 * Test that bad table values cannot be used when creating an instance of
	 * HBaseWriter.
	 * 
	 * @throws IOException Signals that an I/O exception has occurred.
	 */
	@Test()
	public void testCreateHBaseWriter() throws IOException {
		// Test
		try {
			hw = new HBaseWriter(master, null);
			Assert.assertNull(hw);
		} catch (IllegalArgumentException e) {
			Assert.assertNotNull(e);
		}

		try {
			hw = new HBaseWriter(master, "");
			Assert.assertNull(hw);
		} catch (IllegalArgumentException e) {

 =======================================================================
 ==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
 =======================================================================
 package com.powerset.heritrix.writer.test;

import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import com.powerset.heritrix.writer.HBaseWriterPool;

// TODO: Auto-generated Javadoc
/**
 * The Class TestHBaseWriterPool.
 */
public class TestHBaseWriterPool {
	
	/** The master. */
	String master = "locahost";
	
	/** The table. */
	String table = "test";
	
	/** The pool maximum active. */
	int poolMaximumActive = 10;
	
	/** The pool maximum wait. */
	int poolMaximumWait = 20;

	/** The hwp. */
	HBaseWriterPool hwp;

	/**
	 * Builds the {@link HBaseWriterPool} under test from this class's
	 * master, table and pool-limit fields.
	 */
	@BeforeClass
	public void createHBaseWriterPool() {
		this.hwp = new HBaseWriterPool(this.master, this.table,
				this.poolMaximumActive, this.poolMaximumWait);
	}

	/**
	 * Tests HBaseWriterPool integrity.
	 */
	@Test()
	public void testHBaseWriterPoolIntegrity() {
		Assert.assertNotNull(hwp);
		Assert.assertEquals(hwp.getNumActive(), 0);
		Assert.assertEquals(hwp.getNumIdle(), 0);
		Assert.assertEquals(hwp.getSerialNo().intValue(), 0);
		Assert.assertFalse(hwp.getSettings().isCompressed());
		Assert.assertNull(hwp.getSettings().getPrefix());
		Assert.assertNull(hwp.getSettings().getSuffix());

 =======================================================================
 ==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
 =======================================================================
 package com.powerset.heritrix.writer.test;

import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import com.powerset.heritrix.writer.HBaseWriterProcessor;

// TODO: Auto-generated Javadoc
/**
 * Smoke tests for {@link HBaseWriterProcessor}.
 *
 * @author rsmith
 *
 * TODO: mock objects should be used here to test the api integrity.
 */
public class TestHBaseWriterProcessor {

	/** Processor under test; assigned by {@link #createHBaseWriterProcessor()}. */
	HBaseWriterProcessor hwproc;

	/**
	 * Instantiates the processor used by the tests in this class.
	 */
	@BeforeClass
	public void createHBaseWriterProcessor() {
		this.hwproc = new HBaseWriterProcessor();
	}

	/**
	 * Checks that the processor was created and that its URI count is zero.
	 */
	@Test
	public void testHBaseWriterProcessorIntegrity() {
		final HBaseWriterProcessor processor = this.hwproc;
		Assert.assertNotNull(processor);
		Assert.assertEquals(processor.getURICount(), 0);
	}

}

 =======================================================================
 ==.checkstyle
 =======================================================================
 <?xml version="1.0" encoding="UTF-8"?>
<fileset-config file-format-version="1.2.0" simple-config="true">
    <local-check-config name="HBase-Writer checkstyle" location="src/main/resources/checkstyle.xml" type="project" description="HBase-Writer checkstyle">
        <additional-data name="protect-config-file" value="false"/>
    </local-check-config>
    <fileset name="all" enabled="true" check-config-name="HBase-Writer checkstyle" local="true">
        <file-match-pattern match-pattern="." include-pattern="true"/>
    </fileset>
</fileset-config>