K - Key type
V - Value type
public abstract class SequenceFileUtility<K extends org.apache.hadoop.io.Writable,V extends org.apache.hadoop.io.Writable> extends Object implements Iterable<Map.Entry<K,V>>
Modifier and Type | Class and Description |
---|---|
static class |
SequenceFileUtility.FilenameKeyProvider
A class that provides Text keys from the name of a file
|
static interface |
SequenceFileUtility.KeyProvider<K>
Interface for objects that can make a key from a path
|
static class |
SequenceFileUtility.MD5UUIDKeyProvider
A class that provides Text keys by calculating a UUID from the MD5 of a
file
|
static class |
SequenceFileUtility.RelativePathFilenameKeyProvider
A class that provides Text keys from the relative path + name of a file
|
Modifier and Type | Field and Description |
---|---|
protected org.apache.hadoop.io.SequenceFile.CompressionType |
compressionType |
protected org.apache.hadoop.conf.Configuration |
config |
protected org.apache.hadoop.fs.FileSystem |
fileSystem |
protected boolean |
isReader |
protected org.apache.hadoop.fs.Path |
sequenceFilePath |
protected String |
uuid |
protected org.apache.hadoop.io.SequenceFile.Writer |
writer |
Constructor and Description |
---|
SequenceFileUtility(String uriOrPath,
boolean read) |
SequenceFileUtility(String uriOrPath,
org.apache.hadoop.io.SequenceFile.CompressionType compressionType) |
SequenceFileUtility(String uriOrPath,
org.apache.hadoop.io.SequenceFile.CompressionType compressionType,
Map<String,String> metadata) |
SequenceFileUtility(URI uri,
boolean read) |
SequenceFileUtility(URI uri,
org.apache.hadoop.io.SequenceFile.CompressionType compressionType) |
SequenceFileUtility(URI uri,
org.apache.hadoop.io.SequenceFile.CompressionType compressionType,
Map<String,String> metadata) |
Modifier and Type | Method and Description |
---|---|
void |
appendData(K key,
V value)
Append data to a sequence file.
|
void |
appendFile(K key,
org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path p)
Append data read from a file to the sequence file.
|
Map<org.apache.hadoop.fs.Path,K> |
appendFiles(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path,
boolean recurse,
org.apache.hadoop.fs.PathFilter pathFilter,
SequenceFileUtility.KeyProvider<K> keyProvider)
Append files to a sequenceFile.
|
void |
close()
Close the underlying writer.
|
static URI |
convertToURI(String uriOrPath)
Converts a string representing a file or uri to a uri object.
|
void |
exportData(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path dirPath)
Extracts file to a directory.
|
void |
exportData(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path dirPath,
NamingStrategy naming,
ExtractionState extrState,
boolean addExtension,
long offset)
Extracts file to a directory.
|
void |
exportData(NamingStrategy np,
ExtractionState nps,
long offset,
KeyValueDump<K,V> dump) |
void |
exportData(String uriOrPath)
Extracts file to a directory.
|
void |
exportData(String uriOrPath,
NamingStrategy naming,
ExtractionState extrState,
boolean addExtension,
long offset)
Extracts file to a directory.
|
void |
exportDataToZip(String uriOrPath,
NamingStrategy naming,
ExtractionState state,
boolean addExtension,
long offset)
Extracts file to a zip file.
|
void |
exportDataToZip(ZipOutputStream zos,
NamingStrategy naming,
ExtractionState extrState,
boolean addExtension,
long offset)
Extracts file to a zip file.
|
void |
extract(List<RecordInformationExtractor> extractors,
PrintStream stream,
String delim)
Go through a sequence file, applying each
RecordInformationExtractor to each key, printing out the results
in order to the provided PrintStream |
V |
find(K queryKey)
Search for the record identified by queryKey.
|
V |
find(K queryKey,
long offset)
Search for the record identified by queryKey.
|
boolean |
findAndExport(K key,
org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path dirPath,
long offset)
Find a record and write the value to a file.
|
boolean |
findAndExport(K key,
String uriOrPath,
long offset)
Find a record and write the value to a file.
|
Class<? extends org.apache.hadoop.io.compress.CompressionCodec> |
getCompressionCodecClass() |
org.apache.hadoop.io.SequenceFile.CompressionType |
getCompressionType() |
static org.apache.hadoop.fs.Path[] |
getFilePaths(String[] uriOrPaths,
String filenamePrefix)
Get a list of all the sequence files (with a given name prefix) in the
set of input paths.
|
static org.apache.hadoop.fs.Path[] |
getFilePaths(String[] uriOrPaths,
String subdir,
String filenamePrefix)
Get a list of all the sequence files (with a given name prefix) in the
set of input paths.
|
static org.apache.hadoop.fs.Path[] |
getFilePaths(String uriOrPath,
String filenamePrefix)
Get a list of all the sequence files (with a given name prefix) in a
directory.
|
static URI[] |
getFiles(String uriOrPath,
String filenamePrefix)
Get a list of all the sequence files (with a given name prefix) in a
directory.
|
static URI[] |
getFilesRegex(String uriOrPath,
String regex)
Get a list of all the sequence files whose names match the given regular
expression in a directory.
|
org.apache.hadoop.fs.FileSystem |
getFileSystem(URI uri)
Get the filesystem associated with a uri.
|
static org.apache.hadoop.fs.FileSystem |
getFileSystem(URI uri,
org.apache.hadoop.conf.Configuration config)
Get the filesystem associated with a uri.
|
Map<org.apache.hadoop.io.Text,org.apache.hadoop.io.Text> |
getMetadata()
Return the metadata map.
|
long |
getNumberRecords()
Get number of records in file.
|
org.apache.hadoop.fs.Path |
getPath(URI uri)
Get a path from a uri.
|
static URI[] |
getReducerFiles(String uriOrPath)
Get a list of all the reducer outputs in a directory.
|
org.apache.hadoop.fs.Path |
getSequenceFilePath() |
String |
getUUID()
Get the UUID of this file
|
Iterator<Map.Entry<K,V>> |
iterator() |
List<K> |
listKeys()
Return a list of the keys in the sequence file.
|
Map<K,Long> |
listKeysAndOffsets()
Return a map of the keys in the sequence file to their offsets.
|
static String |
md5sum(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path p)
Get the MD5 sum of a file
|
static ZipOutputStream |
openZipOutputStream(String uriOrPath) |
protected abstract void |
printFile(V value) |
protected abstract V |
readFile(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path) |
protected abstract void |
writeFile(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path path,
V value) |
void |
writePathMap(Map<org.apache.hadoop.fs.Path,K> map) |
protected abstract void |
writeZipData(ZipOutputStream zos,
V value) |
Methods inherited from class Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
Methods inherited from interface Iterable: forEach, spliterator
protected org.apache.hadoop.conf.Configuration config
protected org.apache.hadoop.fs.FileSystem fileSystem
protected org.apache.hadoop.fs.Path sequenceFilePath
protected org.apache.hadoop.io.SequenceFile.Writer writer
protected org.apache.hadoop.io.SequenceFile.CompressionType compressionType
protected boolean isReader
public SequenceFileUtility(String uriOrPath, boolean read) throws IOException
IOException
public SequenceFileUtility(URI uri, boolean read) throws IOException
IOException
public SequenceFileUtility(String uriOrPath, org.apache.hadoop.io.SequenceFile.CompressionType compressionType) throws IOException
IOException
public SequenceFileUtility(URI uri, org.apache.hadoop.io.SequenceFile.CompressionType compressionType) throws IOException
IOException
public SequenceFileUtility(String uriOrPath, org.apache.hadoop.io.SequenceFile.CompressionType compressionType, Map<String,String> metadata) throws IOException
IOException
public SequenceFileUtility(URI uri, org.apache.hadoop.io.SequenceFile.CompressionType compressionType, Map<String,String> metadata) throws IOException
IOException
public static URI[] getReducerFiles(String uriOrPath) throws IOException
uriOrPath - the path or uri
IOException
public static URI[] getFiles(String uriOrPath, String filenamePrefix) throws IOException
uriOrPath - the path or uri
filenamePrefix - the prefix of the file name
IOException
public static org.apache.hadoop.fs.Path[] getFilePaths(String[] uriOrPaths, String filenamePrefix) throws IOException
uriOrPaths - the paths or uris
filenamePrefix - the prefix of the file name
IOException
public static org.apache.hadoop.fs.Path[] getFilePaths(String[] uriOrPaths, String subdir, String filenamePrefix) throws IOException
Optionally a subdirectory can be provided; if provided the subdirectory is appended to each path (i.e. PATH/subdirectory).
If the given uri is not a directory, then it is assumed that it is a single SequenceFile and returned directly.
uriOrPaths - the URI or path to the directory/file
subdir - the optional subdirectory (may be null)
filenamePrefix - the prefix of the file name
IOException
public static org.apache.hadoop.fs.Path[] getFilePaths(String uriOrPath, String filenamePrefix) throws IOException
uriOrPath - the path or uri
filenamePrefix - the prefix of the file name
IOException
public static URI[] getFilesRegex(String uriOrPath, String regex) throws IOException
uriOrPath - the path or uri
regex - the regular expression to match
IOException
public Map<K,Long> listKeysAndOffsets()
public void extract(List<RecordInformationExtractor> extractors, PrintStream stream, String delim)
Go through a sequence file, applying each RecordInformationExtractor to each key, printing out the results in order to the provided PrintStream.
extractors - the RecordInformationExtractors to apply
stream - the stream to write to
delim -
public static URI convertToURI(String uriOrPath)
uriOrPath - uri or path to convert
public Map<org.apache.hadoop.io.Text,org.apache.hadoop.io.Text> getMetadata()
public List<K> listKeys()
public void exportData(String uriOrPath) throws IOException
uriOrPath - path or uri to extract to.
IOException
public void exportData(String uriOrPath, NamingStrategy naming, ExtractionState extrState, boolean addExtension, long offset) throws IOException
uriOrPath - path or uri to extract to.
naming - the naming strategy
extrState - the extraction state
addExtension - if true, then file extensions are added to each record automatically
offset - offset from which to start. Can be used to reduce number of files extracted.
IOException
public static ZipOutputStream openZipOutputStream(String uriOrPath) throws IOException
IOException
public void exportDataToZip(String uriOrPath, NamingStrategy naming, ExtractionState state, boolean addExtension, long offset) throws IOException
uriOrPath - path or uri to extract to.
naming - the naming strategy
state - the extraction state
addExtension - if true, then file extensions are added to each record automatically
offset - offset from which to start. Can be used to reduce number of files extracted.
IOException
public void exportDataToZip(ZipOutputStream zos, NamingStrategy naming, ExtractionState extrState, boolean addExtension, long offset) throws IOException
zos - The ZipOutputStream to write to
naming - The naming strategy
extrState - The extraction state
addExtension - if true, then file extensions are added to each record automatically
offset - offset from which to start. Can be used to reduce number of files extracted.
IOException
public void exportData(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path dirPath)
fs - filesystem of output file
dirPath - path to extract to
public void exportData(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path dirPath, NamingStrategy naming, ExtractionState extrState, boolean addExtension, long offset)
fs - filesystem of output file
dirPath - path to extract to
naming - the naming strategy
extrState - the extraction state
addExtension - if true, then file extensions are added to each record automatically
offset - offset from which to start. Can be used to reduce number of files extracted.
public void exportData(NamingStrategy np, ExtractionState nps, long offset, KeyValueDump<K,V> dump)
public void close() throws IOException
IOException
public long getNumberRecords()
public Class<? extends org.apache.hadoop.io.compress.CompressionCodec> getCompressionCodecClass()
public org.apache.hadoop.io.SequenceFile.CompressionType getCompressionType()
public org.apache.hadoop.fs.FileSystem getFileSystem(URI uri) throws IOException
uri -
IOException
public static org.apache.hadoop.fs.FileSystem getFileSystem(URI uri, org.apache.hadoop.conf.Configuration config) throws IOException
uri -
config -
IOException
public org.apache.hadoop.fs.Path getPath(URI uri) throws IOException
uri -
IOException
public static String md5sum(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path p)
fs -
p -
protected abstract V readFile(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path) throws IOException
IOException
protected abstract void writeFile(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path, V value) throws IOException
IOException
protected abstract void writeZipData(ZipOutputStream zos, V value) throws IOException
IOException
protected abstract void printFile(V value) throws IOException
IOException
public void appendFile(K key, org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path p) throws IOException
key -
fs -
p -
IOException
public void appendData(K key, V value) throws IOException
key -
value -
IOException
public Map<org.apache.hadoop.fs.Path,K> appendFiles(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path path, boolean recurse, org.apache.hadoop.fs.PathFilter pathFilter, SequenceFileUtility.KeyProvider<K> keyProvider) throws IOException
fs - The filesystem of the files being added.
path - The path of the file(s) being added.
recurse - If true, then subdirectories are also searched
pathFilter - Filter for omitting files. Can be null.
keyProvider - Object that can return a key for a given file.
IOException
public void writePathMap(Map<org.apache.hadoop.fs.Path,K> map) throws IOException
IOException
public V find(K queryKey, long offset)
queryKey - the key.
offset - the offset from which to commence search
public V find(K queryKey)
queryKey -
public boolean findAndExport(K key, String uriOrPath, long offset) throws IOException
key -
uriOrPath -
offset -
IOException
public boolean findAndExport(K key, org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path dirPath, long offset) throws IOException
key -
fs -
dirPath -
offset -
IOException
public org.apache.hadoop.fs.Path getSequenceFilePath()