The following document contains the results of RAT (Release Audit Tool).
*****************************************************
Summary
-------
Notes: 3
Binaries: 0
Archives: 0
Standards: 12
Apache Licensed: 0
Generated Documents: 0
JavaDocs are generated and so license header is optional
Generated files do not require license headers
12 Unknown Licenses
*******************************
Archives (+ indicates readable, $ unreadable):
*****************************************************
Files with AL headers will be marked L
Binary files (which do not require AL headers) will be marked B
Compressed archives will be marked A
Notices, licenses etc will be marked N
!????? pom.xml
N CHANGELOG.txt
!????? src/conf/modules/Processor.options
!????? src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
!????? src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
!????? src/main/java/com/powerset/heritrix/writer/package.html
!????? src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
!????? src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
!????? src/main/resources/checkstyle.xml
!????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
!????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
!????? src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
!????? .checkstyle
N README.txt
N LICENSE.txt
*****************************************************
Printing headers for files without AL header...
=======================================================================
==pom.xml
=======================================================================
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.powerset.heritrix</groupId>
<artifactId>hbase-writer</artifactId>
<packaging>jar</packaging>
<version>0.19.2-SNAPSHOT</version>
<name>HBase Writer</name>
<description>A library for writing Heritrix output directly to HBase tables.</description>
<url>http://code.google.com/p/hbase-writer</url>
<inceptionYear>2007</inceptionYear>
<issueManagement>
<system>GoogleCode Issues</system>
<url>http://code.google.com/p/hbase-writer/issues/list</url>
</issueManagement>
<mailingLists>
<mailingList>
<name>User List</name>
<subscribe>hbase-user-subscribe@hadoop.apache.org</subscribe>
<unsubscribe>hbase-user-unsubscribe@hadoop.apache.org</unsubscribe>
<post>hbase-user@hadoop.apache.org</post>
<archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-user/</archive>
</mailingList>
<mailingList>
<name>Dev List</name>
<subscribe>hbase-dev-subscribe@hadoop.apache.org</subscribe>
<unsubscribe>hbase-dev-unsubscribe@hadoop.apache.org</unsubscribe>
<post>hbase-dev@hadoop.apache.org</post>
<archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-dev/</archive>
</mailingList>
<mailingList>
<name>Commits List</name>
<subscribe>hbase-commits-subscribe@hadoop.apache.org</subscribe>
<unsubscribe>hbase-commits-unsubscribe@hadoop.apache.org</unsubscribe>
<post>hbase-commits@hadoop.apache.org</post>
<archive>http://mail-archives.apache.org/mod_mbox/hadoop-hbase-commits/</archive>
</mailingList>
</mailingLists>
<licenses>
<license>
<name>LGPL</name>
<url>http://www.gnu.org/copyleft/lesser.html</url>
=======================================================================
==src/conf/modules/Processor.options
=======================================================================
# Available processors.
# Each processor class should be listed with full package info
# followed by a '|' and a descriptive name (containing only [a-z,A-Z])
# Lines beginning with # and empty lines are ignored
com.powerset.heritrix.writer.HBaseWriterProcessor|HBaseArchiver
=======================================================================
==src/main/java/com/powerset/heritrix/writer/HBaseWriterPool.java
=======================================================================
/** HBaseWriterPool
*
* $Id$
*
* Created on June 23rd, 2007
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.powerset.heritrix.writer;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.pool.BasePoolableObjectFactory;
import org.archive.io.DefaultWriterPoolSettings;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
// TODO: Auto-generated Javadoc
/**
* A pool of HBaseWriters.
*/
public class HBaseWriterPool extends WriterPool {
/**
* Constructor.
*
* @param poolMaximumActive the pool maximum active
* @param poolMaximumWait the pool maximum wait
* @param master the master
* @param table the table
*/
public HBaseWriterPool(final String master, final String table, final int poolMaximumActive, final int poolMaximumWait) {
// Below is hard to follow. It's an invocation of this class's super
// constructor, passing a serial, an instance of BasePoolable... that
// is defined inline, followed by settings, max and wait.
super(new AtomicInteger(), new BasePoolableObjectFactory() {
=======================================================================
==src/main/java/com/powerset/heritrix/writer/HBaseWriterProcessor.java
=======================================================================
/**
* HBaseWriterProcessor
*
* $Id$
*
* Created on June 23rd, 2007
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.powerset.heritrix.writer;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.log4j.Logger;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ModuleAttributeConstants;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.ProcessorURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.net.ServerCacheUtil;
import org.archive.state.Expert;
import org.archive.state.Immutable;
import org.archive.state.Initializable;
import org.archive.state.Key;
import org.archive.state.KeyManager;
=======================================================================
==src/main/java/com/powerset/heritrix/writer/package.html
=======================================================================
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head />
<body bgcolor="white">
Provides <a href="http://hbase.org">HBase</a> writer for
<a href="http://crawler.archive.org">heritrix</a>.
<h2>Requirements</h2>
TODO
</body>
</html>
=======================================================================
==src/main/java/com/powerset/heritrix/writer/HBaseWriter.java
=======================================================================
/**
* HBaseWriter
*
* $Id$
*
* Created on June 23rd, 2007
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.powerset.heritrix.writer;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Keying;
import org.apache.log4j.Logger;
import org.archive.io.ArchiveFileConstants;
import org.archive.io.RecordingInputStream;
import org.archive.io.RecordingOutputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.modules.ProcessorURI;
// TODO: Auto-generated Javadoc
/**
* Write to HBase. Puts content into the 'content:' column and all else into the
* 'curi:' column family. Makes a row key of an url transformation. Creates
* table if it does not exist.
=======================================================================
==src/main/resources/com/powerset/heritrix/writer/HBaseWriterProcessor_en.utf8
=======================================================================
description:
HBaseWriter processor.
server-cache-description:
The server cache used to resolve IP addresses.
pool-max-active-description:
Maximum active files in pool. This setting cannot be varied over the life
of a crawl.
pool-max-wait-description:
Maximum time to wait on pool element (milliseconds). This setting cannot
be varied over the life of a crawl.
total-bytes-to-write-description:
Total file bytes to write to disk. Once the size of all files on disk has
exceeded this limit, this processor will stop the crawler. A value of
zero means no upper limit.
table-description:
Name of the HBase table to write crawl to
master-description:
Master host and port: e.g. localhost:60010
=======================================================================
==src/main/resources/checkstyle.xml
=======================================================================
<?xml version="1.0"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.2//EN" "http://www.puppycrawl.com/dtds/configuration_1_2.dtd">
<!--
Checkstyle configuration that checks the sun coding conventions from:
- the Java Language Specification at
http://java.sun.com/docs/books/jls/second_edition/html/index.html
- the Sun Code Conventions at http://java.sun.com/docs/codeconv/
- the Javadoc guidelines at
http://java.sun.com/j2se/javadoc/writingdoccomments/index.html
- the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html
- some best practices
Checkstyle is very configurable. Be sure to read the documentation at
http://checkstyle.sf.net (or in your downloaded distribution).
Most Checks are configurable, be sure to consult the documentation.
To completely disable a check, just comment it out or delete it from the file.
Finally, it is worth reading the documentation.
-->
<module name="Checker">
<module name="TreeWalker">
<!-- Checks for imports -->
<!-- See http://checkstyle.sf.net/config_import.html -->
<module name="AvoidStarImport"/>
<module name="IllegalImport"/> <!-- defaults to sun.* packages -->
<module name="RedundantImport"/>
<module name="UnusedImports"/>
<!-- Checks for Size Violations. -->
<!-- See http://checkstyle.sf.net/config_sizes.html -->
<!-- Modifier Checks -->
<!-- See http://checkstyle.sf.net/config_modifiers.html -->
<!-- Checks for blocks. You know, those {}'s -->
=======================================================================
==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriter.java
=======================================================================
package com.powerset.heritrix.writer.test;
import java.io.IOException;
import org.testng.Assert;
import org.testng.annotations.Test;
import com.powerset.heritrix.writer.HBaseWriter;
// TODO: Auto-generated Javadoc
/**
* The Class TestHBaseWriter.
*/
public class TestHBaseWriter {
/** The master. */
String master = "localhost:60000";
/** The table. */
String table = "test";
/** The pool maximum active. */
int poolMaximumActive = 10;
/** The pool maximum wait. */
int poolMaximumWait = 20;
/** The hw. */
HBaseWriter hw;
/**
* Test that bad table values cannot be used when creating an instance of
* HbaseWriter.
*
* @throws IOException Signals that an I/O exception has occurred.
*/
@Test()
public void testCreateHBaseWriter() throws IOException {
// Test
try {
hw = new HBaseWriter(master, null);
Assert.assertNull(hw);
} catch (IllegalArgumentException e) {
Assert.assertNotNull(e);
}
try {
hw = new HBaseWriter(master, "");
Assert.assertNull(hw);
} catch (IllegalArgumentException e) {
=======================================================================
==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterPool.java
=======================================================================
package com.powerset.heritrix.writer.test;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import com.powerset.heritrix.writer.HBaseWriterPool;
// TODO: Auto-generated Javadoc
/**
* The Class TestHBaseWriterPool.
*/
public class TestHBaseWriterPool {
/** The master. */
String master = "locahost";
/** The table. */
String table = "test";
/** The pool maximum active. */
int poolMaximumActive = 10;
/** The pool maximum wait. */
int poolMaximumWait = 20;
/** The hwp. */
HBaseWriterPool hwp;
/**
* Creates the h base writer pool.
*/
@BeforeClass()
public void createHBaseWriterPool() {
hwp = new HBaseWriterPool(master, table, poolMaximumActive,
poolMaximumWait);
}
/**
* Test h base writer pool integrity.
*/
@Test()
public void testHBaseWriterPoolIntegrity() {
Assert.assertNotNull(hwp);
Assert.assertEquals(hwp.getNumActive(), 0);
Assert.assertEquals(hwp.getNumIdle(), 0);
Assert.assertEquals(hwp.getSerialNo().intValue(), 0);
Assert.assertFalse(hwp.getSettings().isCompressed());
Assert.assertNull(hwp.getSettings().getPrefix());
Assert.assertNull(hwp.getSettings().getSuffix());
=======================================================================
==src/test/java/com/powerset/heritrix/writer/test/TestHBaseWriterProcessor.java
=======================================================================
package com.powerset.heritrix.writer.test;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import com.powerset.heritrix.writer.HBaseWriterProcessor;
// TODO: Auto-generated Javadoc
/**
 * Smoke tests for {@link HBaseWriterProcessor}.
 *
 * <p>A processor is built with the no-argument constructor in the
 * {@code @BeforeClass} method; the single test then checks that the instance
 * exists and that it reports a URI count of zero.
 *
 * @author rsmith
 *
 * TODO: mock objects should be used here to test the api integrity.
 */
public class TestHBaseWriterProcessor {

    /** Processor under test; assigned in {@link #createHBaseWriterProcessor()}. */
    HBaseWriterProcessor hwproc;

    /**
     * Builds the processor instance used by the test in this class.
     */
    @BeforeClass
    public void createHBaseWriterProcessor() {
        this.hwproc = new HBaseWriterProcessor();
    }

    /**
     * Verifies that the processor was created and that its URI count is zero.
     */
    @Test
    public void testHBaseWriterProcessorIntegrity() {
        final HBaseWriterProcessor processor = this.hwproc;
        Assert.assertNotNull(processor);
        Assert.assertEquals(processor.getURICount(), 0);
    }
}
=======================================================================
==.checkstyle
=======================================================================
<?xml version="1.0" encoding="UTF-8"?>
<fileset-config file-format-version="1.2.0" simple-config="true">
<local-check-config name="HBase-Writer checkstyle" location="src/main/resources/checkstyle.xml" type="project" description="HBase-Writer checkstyle">
<additional-data name="protect-config-file" value="false"/>
</local-check-config>
<fileset name="all" enabled="true" check-config-name="HBase-Writer checkstyle" local="true">
<file-match-pattern match-pattern="." include-pattern="true"/>
</fileset>
</fileset-config>