public class HBaseWriterProcessor
extends org.archive.modules.writer.WriterPoolProcessor
implements org.archive.io.warc.WARCWriterPoolSettings
<bean id="hbaseParameterSettings" class="org.archive.io.hbase.HBaseParameters">
<!-- These settings are required -->
<property name="zkQuorum" value="localhost" />
<property name="hbaseTableName" value="crawl" />
<!-- This should reflect your installation, but 2181 is the default -->
<property name="zkPort" value="2181" />
<!-- All other settings are optional -->
<property name="onlyProcessNewRecords" value="false" />
<property name="onlyWriteNewRecords" value="false" />
<property name="contentColumnFamily" value="newcontent" />
<property name="defaultMaxFileSizeInBytes" value="26214400" />
<!-- 25 * 1024 * 1024 = 26214400 bytes -->
<!-- Overwrite more options here -->
</bean>
<bean id="hbaseWriterProcessor" class="org.archive.modules.writer.HBaseWriterProcessor">
<property name="hbaseParameters">
<ref bean="hbaseParameterSettings"/>
</property>
</bean>
<bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
<property name="processors">
<list>
<ref bean="hbaseWriterProcessor"/>
<!-- other references -->
</list>
</property>
</bean>
| Modifier and Type | Field and Description |
|---|---|
| static long | serialVersionUID<br>The Constant serialVersionUID. |
ANNOTATION_UNWRITTEN, compress, directory, frequentFlushes, maxFileSizeBytes, maxTotalBytesToWrite, maxWaitForIdleMs, poolMaxActive, prefix, serverCache, skipIdenticalDigests, startNewFilesOnCheckpoint, storePaths, template, writeBufferSize

| Constructor and Description |
|---|
| HBaseWriterProcessor() |
| Modifier and Type | Method and Description |
|---|---|
| protected org.archive.io.WriterPool | generateWriterPool(AtomicInteger serial) |
| long | getDefaultMaxFileSize()<br>Gets the default max file size. |
| List<org.archive.spring.ConfigPath> | getDefaultStorePaths()<br>Gets the default store paths. |
| HBaseParameters | getHbaseParameters()<br>Gets the hbase parameters. |
| List<String> | getMetadata() |
| org.archive.uid.RecordIDGenerator | getRecordIDGenerator() |
| protected org.archive.modules.ProcessResult | innerProcessResult(org.archive.modules.CrawlURI uri) |
| void | setHbaseParameters(HBaseParameters options)<br>Sets the hbase parameters. |
| protected void | setupPool(AtomicInteger serial) |
| protected boolean | shouldProcess(org.archive.modules.CrawlURI curi) |
| protected boolean | shouldWrite(org.archive.modules.CrawlURI curi)<br>Whether the given CrawlURI should be written to archive files. |
| protected org.archive.modules.ProcessResult | write(org.archive.modules.CrawlURI curi, long recordLength, boolean doNotWriteContent)<br>Write to HBase. |
calcOutputDirs, checkBytesWritten, copyForwardWriteTagIfDupe, doCheckpoint, fromCheckpointJson, getCompress, getDirectory, getFrequentFlushes, getHostAddress, getMaxFileSizeBytes, getMaxTotalBytesToWrite, getMaxWaitForIdleMs, getMetadataProvider, getPool, getPoolMaxActive, getPrefix, getSerialNo, getServerCache, getSkipIdenticalDigests, getStartNewFilesOnCheckpoint, getStorePaths, getTemplate, getTotalBytesWritten, getWriteBufferSize, innerProcess, innerRejectProcess, setCompress, setDirectory, setFrequentFlushes, setMaxFileSizeBytes, setMaxTotalBytesToWrite, setMaxWaitForIdleMs, setMetadataProvider, setPool, setPoolMaxActive, setPrefix, setServerCache, setSkipIdenticalDigests, setStartNewFilesOnCheckpoint, setStorePaths, setTemplate, setTotalBytesWritten, setWriteBufferSize, start, stop, toCheckpointJson

finishCheckpoint, flattenVia, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, isRunning, isSuccess, process, report, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, startCheckpoint

clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait

public static final long serialVersionUID
public HBaseParameters getHbaseParameters()
public void setHbaseParameters(HBaseParameters options)
options - the new hbase parameters
public long getDefaultMaxFileSize()
getDefaultMaxFileSize in class org.archive.modules.writer.WriterPoolProcessor
protected void setupPool(AtomicInteger serial)
setupPool in class org.archive.modules.writer.WriterPoolProcessor
protected org.archive.io.WriterPool generateWriterPool(AtomicInteger serial)
protected org.archive.modules.ProcessResult innerProcessResult(org.archive.modules.CrawlURI uri)
innerProcessResult in class org.archive.modules.writer.WriterPoolProcessor
protected boolean shouldProcess(org.archive.modules.CrawlURI curi)
shouldProcess in class org.archive.modules.writer.WriterPoolProcessor
protected boolean shouldWrite(org.archive.modules.CrawlURI curi)
shouldWrite in class org.archive.modules.writer.WriterPoolProcessor
curi - CrawlURI
protected org.archive.modules.ProcessResult write(org.archive.modules.CrawlURI curi,
long recordLength,
boolean doNotWriteContent)
throws IOException
curi - the curi
recordLength - the record length
in - the in
IOException - Signals that an I/O exception has occurred.
public List<org.archive.spring.ConfigPath> getDefaultStorePaths()
getDefaultStorePaths in class org.archive.modules.writer.WriterPoolProcessor
public List<String> getMetadata()
getMetadata in interface org.archive.io.WriterPoolSettings
getMetadata in class org.archive.modules.writer.WriterPoolProcessor
public org.archive.uid.RecordIDGenerator getRecordIDGenerator()
getRecordIDGenerator in interface org.archive.io.warc.WARCWriterPoolSettings
Copyright © 2007–2014. All rights reserved.