The following document contains the results of RAT (Release Audit Tool).
*****************************************************
Summary
-------
Notes: 3
Binaries: 0
Archives: 0
Standards: 11
Apache Licensed: 0
Generated Documents: 0
JavaDocs are generated and so license header is optional
Generated files do not require license headers
11 Unknown Licenses
*******************************
Archives (+ indicates readable, $ unreadable):
*****************************************************
Files with AL headers will be marked L
Binary files (which do not require AL headers) will be marked B
Compressed archives will be marked A
Notices, licenses etc will be marked N
N CHANGELOG.txt
N LICENSE.txt
!????? pom.xml
N README.txt
!????? src/conf/modules/Processor.options
!????? src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
!????? src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
!????? src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
!????? src/main/java/com/powerset/heritrix/writer/package.html
!????? src/main/resources/checkstyle.xml
!????? src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
!????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
!????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
!????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
*****************************************************
Printing headers for files without AL header...
=======================================================================
==pom.xml
=======================================================================
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.powerset.heritrix</groupId>
<artifactId>hbase-writer</artifactId>
<packaging>jar</packaging>
<version>0.20.3-SNAPSHOT</version>
<name>HBase Writer</name>
<description>A library for writing Heritrix2 output directly to an HBase table as records.</description>
<url>http://code.google.com/p/hbase-writer</url>
<inceptionYear>2007</inceptionYear>
<issueManagement>
<system>GoogleCode Issues</system>
<url>http://code.google.com/p/hbase-writer/issues/list</url>
</issueManagement>
<mailingLists>
<mailingList>
<name>User List</name>
<subscribe>hbase-user-subscribe@hadoop.apache.org</subscribe>
<unsubscribe>hbase-user-unsubscribe@hadoop.apache.org</unsubscribe>
<post>hbase-user@hadoop.apache.org</post>
<archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-user/</archive>
</mailingList>
<mailingList>
<name>Dev List</name>
<subscribe>hbase-dev-subscribe@hadoop.apache.org</subscribe>
<unsubscribe>hbase-dev-unsubscribe@hadoop.apache.org</unsubscribe>
<post>hbase-dev@hadoop.apache.org</post>
<archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-dev/</archive>
</mailingList>
<mailingList>
<name>Commits List</name>
<subscribe>hbase-commits-subscribe@hadoop.apache.org</subscribe>
<unsubscribe>hbase-commits-unsubscribe@hadoop.apache.org</unsubscribe>
<post>hbase-commits@hadoop.apache.org</post>
<archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-commits/</archive>
</mailingList>
</mailingLists>
<licenses>
<license>
<name>LGPL</name>
<url>http://www.gnu.org/copyleft/lesser.html</url>
=======================================================================
==src/conf/modules/Processor.options
=======================================================================
# Available processors.
# Each processor class should be listed with full package info
# followed by a '|' and a descriptive name (containing only [a-z,A-z])
# Lines beginning with # and empty lines are ignored
com.powerset.heritrix.writer.HBaseWriterProcessor|HBaseArchiver
=======================================================================
==src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
=======================================================================
/**
* HBaseWriter
*
* $Id$
*
* Created on June 23rd, 2007
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.powerset.heritrix.writer;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.hadoop.io.IOUtils;
import org.apache.log4j.Logger;
import org.archive.io.RecordingInputStream;
import org.archive.io.RecordingOutputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ProcessorURI;
// TODO: Auto-generated Javadoc
/**
* Write crawled content as records to an HBase table.
* Puts content into the 'content:raw_data' column and all else into the
* 'curi:' column family. Makes a row key of an url transformation. Creates
=======================================================================
==src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
=======================================================================
/** HBaseWriterPool
*
* $Id$
*
* Created on June 23rd, 2007
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.powerset.heritrix.writer;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.pool.BasePoolableObjectFactory;
import org.archive.io.DefaultWriterPoolSettings;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
// TODO: Auto-generated Javadoc
/**
* A pool of HBaseWriters.
*/
public class HBaseWriterPool extends WriterPool {
/**
* Constructor.
*
* @param zkQuorum the list of zookeeper quorum servers that serve HBase, comma separated.
* i.e.: zkHost1,zkHost2,zkHost3
* @param zkClientPort the port that clients should connect to on the given zk quorum servers.
* i.e.: 2181
* @param table the table name in HBase
* @param poolMaximumActive the maximum number of writers in the writer pool.
* @param poolMaximumWait the maximum wait time for all writers in the pool.
*/
public HBaseWriterPool(final String zkQuorum, final int zkClientPort, final String table, final int poolMaximumActive, final int poolMaximumWait) {
// Below is hard to follow. Its invocation of this classes super
=======================================================================
==src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
=======================================================================
/**
* HBaseWriterProcessor
*
* $Id$
*
* Created on June 23rd, 2007
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.powerset.heritrix.writer;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.log4j.Logger;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ModuleAttributeConstants;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.ProcessorURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.net.ServerCacheUtil;
import org.archive.state.Expert;
import org.archive.state.Immutable;
import org.archive.state.Initializable;
import org.archive.state.Key;
import org.archive.state.KeyManager;
=======================================================================
==src/main/java/com/powerset/heritrix/writer/package.html
=======================================================================
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head />
<body bgcolor="white">
Provides <a href="http://hbase.org">HBase</a> writer for
<a href="http://crawler.archive.org">heritrix</a>.
<h2>Requirements</h2>
TODO
</body>
</html>
=======================================================================
==src/main/resources/checkstyle.xml
=======================================================================
<?xml version="1.0"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.2//EN" "http://www.puppycrawl.com/dtds/configuration_1_2.dtd">
<!--
Checkstyle configuration that checks the sun coding conventions from:
- the Java Language Specification at
http://java.sun.com/docs/books/jls/second_edition/html/index.html
- the Sun Code Conventions at http://java.sun.com/docs/codeconv/
- the Javadoc guidelines at
http://java.sun.com/j2se/javadoc/writingdoccomments/index.html
- the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html
- some best practices
Checkstyle is very configurable. Be sure to read the documentation at
http://checkstyle.sf.net (or in your downloaded distribution).
Most Checks are configurable, be sure to consult the documentation.
To completely disable a check, just comment it out or delete it from the file.
Finally, it is worth reading the documentation.
-->
<module name="Checker">
<module name="TreeWalker">
<!-- Checks for imports -->
<!-- See http://checkstyle.sf.net/config_import.html -->
<module name="AvoidStarImport"/>
<module name="IllegalImport"/> <!-- defaults to sun.* packages -->
<module name="RedundantImport"/>
<module name="UnusedImports"/>
<!-- Checks for Size Violations. -->
<!-- See http://checkstyle.sf.net/config_sizes.html -->
<!-- Modifier Checks -->
<!-- See http://checkstyle.sf.net/config_modifiers.html -->
<!-- Checks for blocks. You know, those {}'s -->
=======================================================================
==src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
=======================================================================
description:
HBaseWriter processor.
server-cache-description:
The server cache used to resolve IP addresses.
pool-max-active-description:
Maximum active files in pool. This setting cannot be varied over the life
of a crawl.
pool-max-wait-description:
Maximum time to wait on pool element (milliseconds). This setting cannot
be varied over the life of a crawl.
total-bytes-to-write-description:
Total file bytes to write to disk. Once the size of all files on disk has
exceeded this limit, this processor will stop the crawler. A value of
zero means no upper limit.
table-description:
Name of the HBase table to write crawl to
zkquorum-description:
quorum of zk hosts that describe where the hbase master is.
zkclientport-description:
The zookeeper quorum client port that clients should connect to in order to get HBase information.
write-only-new-records-description:
If set to true, only write new rowkey records (urls) to hbase.
Default is set to false, which writes all urls that are crawled.
process-only-new-records-description:
If set to true, only process (fetch and parse) new rowkey records (urls) to crawl.
Default is set to false, which fetches and parses all urls, even if they exist in hbase.
=======================================================================
==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
=======================================================================
package com.powerset.heritrix.writer.test;
import java.io.IOException;
import org.testng.Assert;
import org.testng.annotations.Test;
import com.powerset.heritrix.writer.HBaseWriter;
/**
* The Class TestHBaseWriter.
*/
public class TestHBaseWriter {
/** The zkQuorum. */
String zkQuorum = "localhost";
/** The zkClientPort. */
int zkClientPort = 2181;
/** The table. */
String table = "test";
/** The pool maximum active. */
int poolMaximumActive = 10;
/** The pool maximum wait. */
int poolMaximumWait = 20;
/** The hw. */
HBaseWriter hw;
/**
* Test that bad table values cannot be used when creating an instance of
* HbaseWriter.
*
* @throws IOException Signals that an I/O exception has occurred.
*/
@Test()
public void testCreateHBaseWriter() throws IOException {
// Test
try {
hw = new HBaseWriter(zkQuorum,zkClientPort,null);
Assert.assertNull(hw);
} catch (IllegalArgumentException e) {
Assert.assertNotNull(e);
}
try {
hw = new HBaseWriter(zkQuorum, zkClientPort, "");
=======================================================================
==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
=======================================================================
package com.powerset.heritrix.writer.test;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import com.powerset.heritrix.writer.HBaseWriterPool;
/**
* The Class TestHBaseWriterPool.
*/
public class TestHBaseWriterPool {
/** The zkQuorum. */
String zkQuorum = "locahost";
/** zkClientPort. */
int zkClientPort = 2181;
/** The table. */
String table = "test";
/** The pool maximum active. */
int poolMaximumActive = 10;
/** The pool maximum wait. */
int poolMaximumWait = 20;
/** The hwp. */
HBaseWriterPool hwp;
/**
* Creates the h base writer pool.
*/
@BeforeClass()
public void createHBaseWriterPool() {
hwp = new HBaseWriterPool(zkQuorum, zkClientPort, table, poolMaximumActive, poolMaximumWait);
}
/**
* Test h base writer pool integrity.
*/
@Test()
public void testHBaseWriterPoolIntegrity() {
Assert.assertNotNull(hwp);
Assert.assertEquals(hwp.getNumActive(), 0);
Assert.assertEquals(hwp.getNumIdle(), 0);
Assert.assertEquals(hwp.getSerialNo().intValue(), 0);
Assert.assertFalse(hwp.getSettings().isCompressed());
Assert.assertNull(hwp.getSettings().getPrefix());
=======================================================================
==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
=======================================================================
package com.powerset.heritrix.writer.test;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import com.powerset.heritrix.writer.HBaseWriterProcessor;
/**
 * Smoke tests for {@link HBaseWriterProcessor}.
 *
 * <p>A single processor instance is built once per test class and then
 * inspected by the test methods.
 *
 * @author rsmith
 *
 * TODO: mock objects should be used here to test the api integrity.
 */
public class TestHBaseWriterProcessor {

    /** Processor under test; assigned in {@link #createHBaseWriterProcessor()}. */
    HBaseWriterProcessor hwproc;

    /**
     * Builds the processor under test using its no-argument constructor.
     */
    @BeforeClass
    public void createHBaseWriterProcessor() {
        hwproc = new HBaseWriterProcessor();
    }

    /**
     * Checks that the constructed processor is not null and that its
     * reported URI count equals zero.
     */
    @Test
    public void testHBaseWriterProcessorIntegrity() {
        Assert.assertNotNull(hwproc);
        Assert.assertEquals(hwproc.getURICount(), 0);
    }
}