forked from opaque-systems/sequencefile
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 74b758b
Showing
36 changed files
with
2,382 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
dist/ | ||
Hadoop.egg-info/ | ||
__pycache__ | ||
build/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
Pure Python SequenceFile Reader and Writer implementation | ||
that allows you to read and write your Hadoop sequence files | ||
without using Java. | ||
|
||
Author: Matteo Bertozzi <[email protected]> | ||
|
||
Contributors: | ||
|
||
* Brian Bloniarz <[email protected]> | ||
* Alex Roper <[email protected]> | ||
* Jeremy G. Kahn <[email protected]> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python | ||
# ======================================================================== | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from hadoop.io.IntWritable import LongWritable, IntWritable | ||
from hadoop.io import ArrayFile | ||
|
||
if __name__ == '__main__':
    # Example: write an ArrayFile, iterate over it, then do random lookups.
    # print() is used with a single argument and range() replaces xrange()
    # so the script parses and prints the same on Python 2 and Python 3.

    # Append the 100 values 1, 11, 21, ..., 991.
    writer = ArrayFile.Writer('array-test', IntWritable)
    # NOTE(review): presumably the spacing between index entries -- confirm
    # against ArrayFile.Writer.
    writer.INDEX_INTERVAL = 16
    for i in range(0, 100):
        writer.append(IntWritable(1 + i * 10))
    writer.close()

    key = LongWritable()
    value = IntWritable()
    reader = ArrayFile.Reader('array-test')
    try:
        # Sequential scan of every (key, value) pair.
        while reader.next(key, value):
            print('%s %s' % (key, value))

        print('GET 8')
        print(reader.get(8, value))
        print(value)

        # Only 100 entries were appended, so 110 is past the end; the value
        # is deliberately not printed for this lookup.
        print('GET 110')
        print(reader.get(110, value))

        print('GET 25')
        print(reader.get(25, value))
        print(value)

        print('GET 55')
        print(reader.get(55, value))
        print(value)
    finally:
        # Release the reader even when one of the lookups raises.
        reader.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/usr/bin/env python | ||
# ======================================================================== | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from hadoop.io.IntWritable import LongWritable | ||
from hadoop.io import MapFile | ||
|
||
if __name__ == '__main__':
    # Example: write a MapFile, iterate over it, then look keys up.
    # print() is used with a single argument and range() replaces xrange()
    # so the script parses and prints the same on Python 2 and Python 3.

    # Keys are the even numbers 0..98; each value is its key times ten.
    writer = MapFile.Writer('map-test', LongWritable, LongWritable)
    # NOTE(review): presumably the spacing between index entries -- confirm
    # against MapFile.Writer.
    writer.INDEX_INTERVAL = 2
    for i in range(0, 100, 2):
        writer.append(LongWritable(i), LongWritable(i * 10))
    writer.close()

    key = LongWritable()
    value = LongWritable()
    reader = MapFile.Reader('map-test')
    try:
        # Sequential scan of every (key, value) pair.
        while reader.next(key, value):
            print('%s %s' % (key, value))

        # One-element tuple so a tuple result is formatted as a whole.
        print('MID KEY %s' % (reader.midKey(),))
        # Call finalKey() first: it receives `key`, which is printed after
        # the call, exactly as the original print statement did.
        final_key_result = reader.finalKey(key)
        print('FINAL KEY %s %s' % (final_key_result, key))

        # NOTE(review): the label says GET CLOSEST but the call below is
        # reader.get(); label kept as-is -- confirm which was intended.
        print('GET CLOSEST')
        key.set(8)
        print(reader.get(key, value))
        print(value)

        # 111 was never written (keys stop at 98); the value is
        # deliberately not printed for this lookup.
        print('GET 111')
        key.set(111)
        print(reader.get(key, value))

        # 25 is odd, so it is not one of the written keys.
        key.set(25)
        print('SEEK 25 before')
        print(reader.getClosest(key, value, before=True))
        print(value)

        # 55 is odd, so it is not one of the written keys.
        key.set(55)
        print('SEEK 55')
        print(reader.getClosest(key, value))
        print(value)
    finally:
        # Release the reader even when one of the lookups raises.
        reader.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/usr/bin/env python | ||
# ======================================================================== | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from hadoop.io.SequenceFile import CompressionType | ||
from hadoop.io.SequenceFile import Metadata | ||
from hadoop.io import LongWritable | ||
from hadoop.io import SequenceFile | ||
|
||
def writeData(writer):
    """Append ten LongWritable records to *writer*, echoing each one.

    Keys count down from 1000 and values count up from 0.  Before every
    append, the result of ``writer.getLength()`` is printed together with
    the key and value about to be written.

    The single-argument ``print()`` call and ``range()`` keep the function
    valid on both Python 2 and Python 3.
    """
    key = LongWritable()
    value = LongWritable()

    for i in range(10):
        key.set(1000 - i)
        value.set(i)
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
|
||
def testWrite(filename):
    """Create *filename* as a LongWritable/LongWritable sequence file.

    Two metadata entries are attached to the file before the sample
    records are written through writeData().
    """
    meta = Metadata()
    entries = (
        ('Meta Key 0', 'Meta Value 0'),
        ('Meta Key 1', 'Meta Value 1'),
    )
    for meta_key, meta_value in entries:
        meta.set(meta_key, meta_value)

    seq_writer = SequenceFile.createWriter(filename, LongWritable, LongWritable, meta)
    writeData(seq_writer)
    seq_writer.close()
|
||
def testRead(filename):
    """Print the metadata and every record stored in *filename*.

    Each record line starts with ``*`` when ``reader.syncSeen()`` is true
    after the read (a blank otherwise), followed by the reader position
    captured before that record was read, then the key and the value.

    The reader is closed even if reading fails part-way through.  The
    ``print()`` calls take a single pre-formatted string so the output is
    identical on Python 2 and Python 3.
    """
    reader = SequenceFile.Reader(filename)
    try:
        metadata = reader.getMetadata()
        for meta_key, meta_value in metadata:
            print('METADATA: %s %s' % (meta_key, meta_value))

        # The file itself says which classes to instantiate for its records.
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        position = reader.getPosition()
        while reader.next(key, value):
            # One line per record: marker, a space, then the record fields --
            # the same text the two chained Python 2 print statements produced.
            marker = '*' if reader.syncSeen() else ' '
            print('%s [%6s] %6s %6s' % (marker, position, key.toString(), value.toString()))
            position = reader.getPosition()
    finally:
        reader.close()
|
||
if __name__ == '__main__':
    # Round trip: write the sample file, then read it straight back.
    filename = 'test-meta.seq'
    for step in (testWrite, testRead):
        step(filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/usr/bin/env python | ||
# ======================================================================== | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import sys | ||
|
||
from hadoop.io import SequenceFile | ||
|
||
if __name__ == '__main__':
    # Dump every record of the sequence file named on the command line.
    # print() is used with a single pre-formatted string so the script
    # parses and prints the same on Python 2 and Python 3.
    if len(sys.argv) < 2:
        print('usage: SequenceFileReader <filename>')
    else:
        reader = SequenceFile.Reader(sys.argv[1])
        try:
            # The file itself says which classes to instantiate.
            key_class = reader.getKeyClass()
            value_class = reader.getValueClass()

            key = key_class()
            value = value_class()

            # Left disabled, as in the original script:
            # reader.sync(4042)
            position = reader.getPosition()
            while reader.next(key, value):
                # '*' marks records for which reader.syncSeen() is true;
                # `position` is the reader position before this record.
                marker = '*' if reader.syncSeen() else ' '
                print('%s [%6s] %6s %6s' % (marker, position, key.toString(), value.toString()))
                position = reader.getPosition()
        finally:
            # Release the reader even when reading fails part-way through.
            reader.close()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env python | ||
# ======================================================================== | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from hadoop.io.SequenceFile import CompressionType | ||
from hadoop.io import LongWritable | ||
from hadoop.io import SequenceFile | ||
|
||
def writeData(writer):
    """Append 1000 LongWritable records to *writer*, echoing each one.

    Keys count down from 1000 and values count up from 0.  Before every
    append, the result of ``writer.getLength()`` is printed together with
    the key and value about to be written.

    The single-argument ``print()`` call and ``range()`` keep the function
    valid on both Python 2 and Python 3.
    """
    key = LongWritable()
    value = LongWritable()

    for i in range(1000):
        key.set(1000 - i)
        value.set(i)
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
|
||
if __name__ == '__main__':
    # Write the same sample data three times: with the default settings,
    # then with RECORD compression, then with BLOCK compression.
    targets = (
        ('test.seq', {}),
        ('test-record.seq', {'compression_type': CompressionType.RECORD}),
        ('test-block.seq', {'compression_type': CompressionType.BLOCK}),
    )
    for path, options in targets:
        writer = SequenceFile.createWriter(path, LongWritable, LongWritable, **options)
        writeData(writer)
        writer.close()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python | ||
# ======================================================================== | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from hadoop.io.IntWritable import IntWritable | ||
from hadoop.io import SetFile | ||
|
||
if __name__ == '__main__':
    # Example: write a SetFile, iterate over it, then probe for keys.
    # print() is used with a single argument and range() replaces xrange()
    # so the script parses and prints the same on Python 2 and Python 3.

    # Keys written: 0, 20, 40, ..., 980.
    writer = SetFile.Writer('set-test', IntWritable)
    # NOTE(review): presumably the spacing between index entries -- confirm
    # against SetFile.Writer.
    writer.INDEX_INTERVAL = 16
    for i in range(0, 100, 2):
        writer.append(IntWritable(i * 10))
    writer.close()

    key = IntWritable()
    reader = SetFile.Reader('set-test')
    try:
        # Sequential scan of every key.
        while reader.next(key):
            print(key)

        # 8 is not one of the written keys.
        print('GET 8')
        key.set(8)
        print(reader.get(key))

        # 120 was written (i == 12).
        print('GET 120')
        key.set(120)
        print(reader.get(key))

        # 240 was written (i == 24).
        print('GET 240')
        key.set(240)
        print(reader.get(key))

        # 550 is not one of the written keys (i == 55 is odd).
        print('GET 550')
        key.set(550)
        print(reader.get(key))
    finally:
        # Release the reader even when one of the lookups raises.
        reader.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env python | ||
# ======================================================================== | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from hadoop.io.SequenceFile import CompressionType | ||
from hadoop.io import Text | ||
from hadoop.io import SequenceFile | ||
|
||
def writeData(writer):
    """Append one Text record, 'Key' -> 'Value', to *writer*."""
    record_key, record_value = Text(), Text()

    record_key.set('Key')
    record_value.set('Value')

    writer.append(record_key, record_value)
|
||
if __name__ == '__main__':
    # Create 'test.seq' with Text keys and values, fill it, and close it.
    seq_writer = SequenceFile.createWriter('test.seq', Text, Text)
    writeData(seq_writer)
    seq_writer.close()
Oops, something went wrong.