org.archive.modules.writer
Class HBaseWriterProcessor

java.lang.Object
  extended by org.archive.modules.Processor
      extended by org.archive.modules.writer.WriterPoolProcessor
          extended by org.archive.modules.writer.HBaseWriterProcessor
All Implemented Interfaces:
org.archive.checkpointing.Checkpointable, org.archive.io.warc.WARCWriterPoolSettings, org.archive.io.WriterPoolSettings, org.archive.spring.HasKeyedProperties, org.springframework.beans.factory.BeanNameAware, org.springframework.context.Lifecycle

public class HBaseWriterProcessor
extends org.archive.modules.writer.WriterPoolProcessor
implements org.archive.io.warc.WARCWriterPoolSettings

A Heritrix 3 processor that writes to Hadoop HBase. The following example shows how to set up this processor in the crawl job configuration.

 <!-- DISPOSITION CHAIN -->
 <bean id="hbaseParameterSettings" class="org.archive.io.hbase.HBaseParameters">
   <property name="contentColumnFamily" value="newcontent" />
   <!-- Overwrite more options here -->
 </bean>

 <bean id="hbaseWriterProcessor" class="org.archive.modules.writer.HBaseWriterProcessor">
   <property name="zkQuorum" value="localhost" />
   <property name="zkClientPort" value="2181" />
   <property name="hbaseTable" value="crawl" />
   <property name="hbaseParameters">
     <ref bean="hbaseParameterSettings" />
   </property>
 </bean>

 <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
   <property name="processors">
     <list>
     <!-- write to aggregate archival files... -->
     <ref bean="hbaseWriterProcessor"/>
     <!-- other references -->
     </list>
   </property>
 </bean>
 
 

See Also:
org.archive.io.hbase.HBaseParameters, for defining hbaseParameters

Field Summary
 
Fields inherited from class org.archive.modules.writer.WriterPoolProcessor
ANNOTATION_UNWRITTEN, directory, frequentFlushes, serverCache, writeBufferSize
 
Fields inherited from class org.archive.modules.Processor
kp, recoveryCheckpoint, uriCount
 
Constructor Summary
HBaseWriterProcessor()
           
 
Method Summary
 HBaseParameters getHbaseParameters()
           
 String getHbaseTable()
           
 List<String> getMetadata()
           
 org.archive.uid.RecordIDGenerator getRecordIDGenerator()
           
 int getZkClientPort()
           
 String getZkQuorum()
          Returns the value of the zkQuorum property (first of the getters and setters).
protected  org.archive.modules.ProcessResult innerProcessResult(org.archive.modules.CrawlURI uri)
           
 boolean onlyProcessNewRecords()
           
 boolean onlyWriteNewRecords()
           
 void setHbaseParameters(HBaseParameters options)
           
 void setHbaseTable(String hbaseTable)
           
 void setOnlyProcessNewRecords(boolean onlyProcessNewRecords)
           
 void setOnlyWriteNewRecords(boolean onlyWriteNewRecords)
           
protected  void setupPool(AtomicInteger serial)
           
 void setZkClientPort(int zkClientPort)
           
 void setZkQuorum(String zkQuorum)
           
protected  boolean shouldProcess(org.archive.modules.CrawlURI curi)
           
protected  boolean shouldWrite(org.archive.modules.CrawlURI curi)
          Whether the given CrawlURI should be written to archive files.
protected  org.archive.modules.ProcessResult write(org.archive.modules.CrawlURI curi, long recordLength, InputStream in)
          Write to HBase.
 
Methods inherited from class org.archive.modules.writer.WriterPoolProcessor
calcOutputDirs, checkBytesWritten, copyForwardWriteTagIfDupe, doCheckpoint, fromCheckpointJson, getCompress, getDirectory, getFrequentFlushes, getHostAddress, getMaxFileSizeBytes, getMaxTotalBytesToWrite, getMaxWaitForIdleMs, getMetadataProvider, getPool, getPoolMaxActive, getPrefix, getSerialNo, getServerCache, getSkipIdenticalDigests, getStorePaths, getTemplate, getTotalBytesWritten, getWriteBufferSize, innerProcess, innerRejectProcess, setCompress, setDirectory, setFrequentFlushes, setMaxFileSizeBytes, setMaxTotalBytesToWrite, setMaxWaitForIdleMs, setMetadataProvider, setPool, setPoolMaxActive, setPrefix, setServerCache, setSkipIdenticalDigests, setStorePaths, setTemplate, setTotalBytesWritten, setWriteBufferSize, start, stop, toCheckpointJson
 
Methods inherited from class org.archive.modules.Processor
finishCheckpoint, flattenVia, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, isRunning, isSuccess, process, report, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, startCheckpoint
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface org.archive.io.WriterPoolSettings
calcOutputDirs, getCompress, getFrequentFlushes, getMaxFileSizeBytes, getPrefix, getTemplate, getWriteBufferSize
 
Methods inherited from interface org.springframework.context.Lifecycle
isRunning
 
Methods inherited from interface org.archive.checkpointing.Checkpointable
finishCheckpoint, setRecoveryCheckpoint, startCheckpoint
 

Constructor Detail

HBaseWriterProcessor

public HBaseWriterProcessor()
Method Detail

getZkQuorum

public String getZkQuorum()
Returns the value of the zkQuorum property (first of the getters and setters).


setZkQuorum

public void setZkQuorum(String zkQuorum)

getZkClientPort

public int getZkClientPort()

setZkClientPort

public void setZkClientPort(int zkClientPort)

getHbaseTable

public String getHbaseTable()

setHbaseTable

public void setHbaseTable(String hbaseTable)

getHbaseParameters

public HBaseParameters getHbaseParameters()

setHbaseParameters

public void setHbaseParameters(HBaseParameters options)

onlyWriteNewRecords

public boolean onlyWriteNewRecords()

setOnlyWriteNewRecords

public void setOnlyWriteNewRecords(boolean onlyWriteNewRecords)

onlyProcessNewRecords

public boolean onlyProcessNewRecords()

setOnlyProcessNewRecords

public void setOnlyProcessNewRecords(boolean onlyProcessNewRecords)

setupPool

protected void setupPool(AtomicInteger serial)
Specified by:
setupPool in class org.archive.modules.writer.WriterPoolProcessor

innerProcessResult

protected org.archive.modules.ProcessResult innerProcessResult(org.archive.modules.CrawlURI uri)
Specified by:
innerProcessResult in class org.archive.modules.writer.WriterPoolProcessor

shouldProcess

protected boolean shouldProcess(org.archive.modules.CrawlURI curi)
Overrides:
shouldProcess in class org.archive.modules.writer.WriterPoolProcessor

shouldWrite

protected boolean shouldWrite(org.archive.modules.CrawlURI curi)
Whether the given CrawlURI should be written to archive files. Annotates CrawlURI with a reason for any negative answer.

Overrides:
shouldWrite in class org.archive.modules.writer.WriterPoolProcessor
Parameters:
curi - CrawlURI
Returns:
true if URI should be written; false otherwise

write

protected org.archive.modules.ProcessResult write(org.archive.modules.CrawlURI curi,
                                                  long recordLength,
                                                  InputStream in)
                                           throws IOException
Write to HBase.

Parameters:
curi - the CrawlURI being written
recordLength - the length of the record
in - the InputStream passed in for the record
Returns:
the process result
Throws:
IOException - Signals that an I/O exception has occurred.

getMetadata

public List<String> getMetadata()
Specified by:
getMetadata in interface org.archive.io.WriterPoolSettings
Specified by:
getMetadata in class org.archive.modules.writer.WriterPoolProcessor

getRecordIDGenerator

public org.archive.uid.RecordIDGenerator getRecordIDGenerator()
Specified by:
getRecordIDGenerator in interface org.archive.io.warc.WARCWriterPoolSettings


Copyright © 2007-2011. All Rights Reserved.