Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
chester-leung committed Feb 24, 2021
0 parents commit 74b758b
Show file tree
Hide file tree
Showing 36 changed files with 2,382 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
dist/
Hadoop.egg-info/
__pycache__
build/
11 changes: 11 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Pure Python SequenceFile Reader and Writer implementation
that allows you to read and write your Hadoop sequence files
without using Java.

Author: Matteo Bertozzi <[email protected]>

Contributors:

* Brian Bloniarz <[email protected]>
* Alex Roper <[email protected]>
* Jeremy G. Kahn <[email protected]>
54 changes: 54 additions & 0 deletions examples/ArrayFileTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python
# ========================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.IntWritable import LongWritable, IntWritable
from hadoop.io import ArrayFile

if __name__ == '__main__':
    # Demo: write 100 IntWritable values (1, 11, 21, ...) to an ArrayFile,
    # then read them back sequentially and by index.
    #
    # Every print below is a single-argument print(...) call so the script
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    writer = ArrayFile.Writer('array-test', IntWritable)
    try:
        # NOTE(review): presumably controls how often an index entry is
        # emitted -- confirm against the ArrayFile/MapFile writer.
        writer.INDEX_INTERVAL = 16
        for i in range(0, 100):
            writer.append(IntWritable(1 + i * 10))
    finally:
        # Close even if an append fails so the file handle is not leaked.
        writer.close()

    key = LongWritable()
    value = IntWritable()
    reader = ArrayFile.Reader('array-test')
    try:
        # Sequential scan: dump every (index, value) pair.
        while reader.next(key, value):
            print('%s %s' % (key, value))

        # Random access by index. 110 is beyond the 100 entries written
        # above; for that lookup only the return value of get() is shown,
        # not the (stale) value buffer.
        for index, show_value in ((8, True), (110, False), (25, True), (55, True)):
            print('GET %d' % index)
            print(reader.get(index, value))
            if show_value:
                print(value)
            print('')
    finally:
        reader.close()
61 changes: 61 additions & 0 deletions examples/MapFileTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env python
# ========================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.IntWritable import LongWritable
from hadoop.io import MapFile

if __name__ == '__main__':
    # Demo: write the even keys 0..98 (value = key * 10) to a MapFile, then
    # exercise sequential reads, midKey/finalKey and keyed lookups.
    #
    # Every print below is a single-argument print(...) call so the script
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    writer = MapFile.Writer('map-test', LongWritable, LongWritable)
    try:
        # NOTE(review): presumably the number of records between index
        # entries -- confirm against MapFile.Writer.
        writer.INDEX_INTERVAL = 2
        for i in range(0, 100, 2):
            writer.append(LongWritable(i), LongWritable(i * 10))
    finally:
        # Close even if an append fails so the file handles are not leaked.
        writer.close()

    key = LongWritable()
    value = LongWritable()
    reader = MapFile.Reader('map-test')
    try:
        # Sequential scan: dump every (key, value) pair.
        while reader.next(key, value):
            print('%s %s' % (key, value))

        print('MID KEY %s' % (reader.midKey(),))
        # Call finalKey() first so `key` is formatted after the call, exactly
        # as the original left-to-right print statement did.
        final_result = reader.finalKey(key)
        print('FINAL KEY %s %s' % (final_result, key))

        print('GET CLOSEST')
        key.set(8)
        print(reader.get(key, value))
        print(value)
        print('')

        # 111 is larger than any key written above; only the return value of
        # get() is shown for it.
        print('GET 111')
        key.set(111)
        print(reader.get(key, value))
        print('')

        # 25 is odd, so it was never written; ask for the closest key with
        # before=True.
        key.set(25)
        print('SEEK 25 before')
        print(reader.getClosest(key, value, before=True))
        print(value)
        print('')

        key.set(55)
        print('SEEK 55')
        print(reader.getClosest(key, value))
        print(value)
        print('')
    finally:
        reader.close()
67 changes: 67 additions & 0 deletions examples/SequenceFileMeta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python
# ========================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.SequenceFile import CompressionType
from hadoop.io.SequenceFile import Metadata
from hadoop.io import LongWritable
from hadoop.io import SequenceFile

def writeData(writer):
    """Append ten LongWritable records to *writer*, logging each one.

    Keys count down from 1000 and values count up from 0. Before each
    append, the writer's current length (``writer.getLength()``) is printed
    together with the key and value about to be written.

    The print uses a single pre-formatted argument so the output is the
    same under Python 2 and Python 3.
    """
    key = LongWritable()
    value = LongWritable()

    for i in range(10):
        key.set(1000 - i)
        value.set(i)
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)

def testWrite(filename):
    """Create a sequence file at *filename* carrying two metadata entries.

    The file uses LongWritable keys and values and is populated by
    ``writeData``. The writer is closed even if writing fails, so the
    underlying file handle is never leaked.
    """
    metadata = Metadata()
    metadata.set('Meta Key 0', 'Meta Value 0')
    metadata.set('Meta Key 1', 'Meta Value 1')

    writer = SequenceFile.createWriter(filename, LongWritable, LongWritable, metadata)
    try:
        writeData(writer)
    finally:
        writer.close()

def testRead(filename):
    """Print the metadata and every record of the sequence file *filename*.

    Each record line is prefixed with ``*`` when ``reader.syncSeen()`` is
    true for that record (a space otherwise), followed by the reader
    position captured before the record was read, then the key and value.

    The reader is closed even if reading fails. Every print uses a single
    pre-formatted argument so the output is the same under Python 2 and
    Python 3.
    """
    reader = SequenceFile.Reader(filename)
    try:
        metadata = reader.getMetadata()
        for meta_key, meta_value in metadata:
            print('METADATA: %s %s' % (meta_key, meta_value))

        # Instantiate key/value holders of the classes recorded in the file.
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        position = reader.getPosition()
        while reader.next(key, value):
            # The original printed the marker with a trailing comma and the
            # record on the next print statement; that yields
            # "<marker> <record>" on one line, reproduced here in one call.
            marker = '*' if reader.syncSeen() else ' '
            print('%s [%6s] %6s %6s' % (marker, position, key.toString(), value.toString()))
            position = reader.getPosition()
    finally:
        reader.close()

if __name__ == '__main__':
    # Round-trip demo: write the metadata-carrying file, then read it back.
    filename = 'test-meta.seq'
    for step in (testWrite, testRead):
        step(filename)
43 changes: 43 additions & 0 deletions examples/SequenceFileReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python
# ========================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

from hadoop.io import SequenceFile

if __name__ == '__main__':
    # Dump every record of the sequence file named on the command line.
    #
    # Every print below is a single-argument print(...) call so the script
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    if len(sys.argv) < 2:
        print('usage: SequenceFileReader <filename>')
    else:
        reader = SequenceFile.Reader(sys.argv[1])
        try:
            # Instantiate key/value holders of the classes recorded in the file.
            key_class = reader.getKeyClass()
            value_class = reader.getValueClass()

            key = key_class()
            value = value_class()

            # Left disabled by the original author:
            #reader.sync(4042)
            position = reader.getPosition()
            while reader.next(key, value):
                # The original printed the marker with a trailing comma and
                # the record on the next print statement; that yields
                # "<marker> <record>" on one line, reproduced here.
                marker = '*' if reader.syncSeen() else ' '
                print('%s [%6s] %6s %6s' % (marker, position, key.toString(), value.toString()))
                position = reader.getPosition()
        finally:
            # Close even if reading fails so the file handle is not leaked.
            reader.close()

45 changes: 45 additions & 0 deletions examples/SequenceFileWriterDemo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python
# ========================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.SequenceFile import CompressionType
from hadoop.io import LongWritable
from hadoop.io import SequenceFile

def writeData(writer):
    """Append 1000 LongWritable records to *writer*, logging each one.

    Keys count down from 1000 and values count up from 0. Before each
    append, the writer's current length (``writer.getLength()``) is printed
    together with the key and value about to be written.

    The print uses a single pre-formatted argument so the output is the
    same under Python 2 and Python 3.
    """
    key = LongWritable()
    value = LongWritable()

    for i in range(1000):
        key.set(1000 - i)
        value.set(i)
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)

if __name__ == '__main__':
    # Write the same data set three times: with the default settings, with
    # CompressionType.RECORD and with CompressionType.BLOCK.
    outputs = (
        ('test.seq', {}),
        ('test-record.seq', {'compression_type': CompressionType.RECORD}),
        ('test-block.seq', {'compression_type': CompressionType.BLOCK}),
    )
    for path, options in outputs:
        writer = SequenceFile.createWriter(path, LongWritable, LongWritable, **options)
        try:
            writeData(writer)
        finally:
            # Close even if writing fails so the file handle is not leaked.
            writer.close()

54 changes: 54 additions & 0 deletions examples/SetFileTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python
# ========================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.IntWritable import IntWritable
from hadoop.io import SetFile

if __name__ == '__main__':
    # Demo: write the keys 0, 20, 40, ... 980 to a SetFile, then scan it and
    # probe a few keys.
    #
    # Every print below is a single-argument print(...) call so the script
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    writer = SetFile.Writer('set-test', IntWritable)
    try:
        # NOTE(review): presumably the number of records between index
        # entries -- confirm against the SetFile/MapFile writer.
        writer.INDEX_INTERVAL = 16
        for i in range(0, 100, 2):
            writer.append(IntWritable(i * 10))
    finally:
        # Close even if an append fails so the file handles are not leaked.
        writer.close()

    key = IntWritable()
    reader = SetFile.Reader('set-test')
    try:
        # Sequential scan: dump every key.
        while reader.next(key):
            print(key)

        # Keyed lookups. 120 and 240 were written above (multiples of 20);
        # 8 and 550 were not.
        for wanted in (8, 120, 240, 550):
            print('GET %d' % wanted)
            key.set(wanted)
            print(reader.get(key))
            print('')
    finally:
        reader.close()
35 changes: 35 additions & 0 deletions examples/TestText.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python
# ========================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.SequenceFile import CompressionType
from hadoop.io import Text
from hadoop.io import SequenceFile

def writeData(writer):
    """Append a single Text record ('Key' -> 'Value') to *writer*."""
    record_key = Text()
    record_key.set('Key')

    record_value = Text()
    record_value.set('Value')

    writer.append(record_key, record_value)

if __name__ == '__main__':
    # Write a one-record Text/Text sequence file to 'test.seq'.
    writer = SequenceFile.createWriter('test.seq', Text, Text)
    try:
        writeData(writer)
    finally:
        # Close even if writing fails so the file handle is not leaked.
        writer.close()
Loading

0 comments on commit 74b758b

Please sign in to comment.