SSENSE-TECH
Published in

SSENSE-TECH

CSV vs Parquet vs Avro: Choosing the Right Tool for the Right Job

Image Source

The Contenders

  1. CSV
  2. Parquet
  3. Avro

CSV

timestamp,product_id,product_score
1555480889,46260,0.3381890166444166
1555480889,85461,0.34786700678437343
1555480889,85462,0.2932997952484385
1555480889,113723,0.3803736086217753
1555480889,113724,0.39684375226672763
1555480889,113727,0.38768682121073605

Final result

Larger Data Sets

Parquet

// Parquet schema for one member record: a UINT_32 memberID, a repeated
// brand_color group (brand, color, score) and a TIMESTAMP_MILLIS timestamp.
// No per-field compression option is set here.
var schema = new parquet.ParquetSchema({
  memberID: { type: 'UINT_32' },
  brand_color: {
    repeated: true,
    fields: {
      brand: { type: 'UINT_32' },
      color: { type: 'UTF8' },
      score: { type: 'FLOAT' },
    },
  },
  timestamp: { type: 'TIMESTAMP_MILLIS' },
});
// Parquet schema for one member record, with compression: 'SNAPPY' set on
// each nested brand_color field (brand, color, score). memberID and timestamp
// carry no compression option.
var schema = new parquet.ParquetSchema({
  memberID: { type: 'UINT_32' },
  brand_color: {
    repeated: true,
    fields: {
      brand: { type: 'UINT_32', compression: 'SNAPPY' },
      color: { type: 'UTF8', compression: 'SNAPPY' },
      score: { type: 'FLOAT', compression: 'SNAPPY' },
    },
  },
  timestamp: { type: 'TIMESTAMP_MILLIS' },
});
/**
 * Iterates over every record of test.parquet using a cursor requested with
 * ['memberID'], then logs the last record seen followed by the record count.
 *
 * The reader is closed in a `finally` block, so it is released even when
 * iteration rejects part-way through.
 *
 * @returns {Promise<void>} Resolves once the reader is closed and both values
 *   have been logged; rejects with the underlying error if reading fails.
 */
async function testParquetRead() {
  const reader = await parquet.ParquetReader.openFile('test.parquet');
  let lastRecord;
  let count = 0;

  try {
    const cursor = reader.getCursor(['memberID']);
    let record;
    // The loop ends at the first falsy value returned by cursor.next().
    while ((record = await cursor.next())) {
      lastRecord = record;
      count++;
    }
  } finally {
    // Close even when iteration throws so the reader is not leaked.
    await reader.close();
  }

  console.log(lastRecord);
  console.log(count);
}

Avro

// Avro record type with an int memberID, a brand_color array of brandColor
// records (brand, color, score) and a long timestamp.
// Type.forSchema is a factory function, so it is called directly rather than
// through `new`.
var schema = avro.Type.forSchema({
  type: 'record',
  fields: [
    { name: 'memberID', type: 'int' },
    {
      name: 'brand_color',
      type: {
        type: 'array',
        items: {
          type: 'record',
          name: 'brandColor',
          fields: [
            { name: 'brand', type: 'int' },
            { name: 'color', type: 'string' },
            { name: 'score', type: 'float' },
          ],
        },
      },
    },
    { name: 'timestamp', type: 'long' },
  ],
});
// Creates a file encoder for test.avro using `schema`; no codec option is passed.
var writer = avro.createFileEncoder('test.avro', schema);
// Creates a file encoder for test.avro using `schema`, selecting the 'snappy'
// codec and supplying snappy.compress as that codec's implementation.
var writer = avro.createFileEncoder('test.avro', schema, {
  codec: 'snappy',
  codecs: { snappy: snappy.compress },
});
/**
 * Streams test.avro through a file decoder and reports what was read.
 *
 * Logs the metadata event payload when it arrives. When the stream ends, logs
 * the number of rows, then the last row rendered with `schema.toString`, and
 * stops the 'read' console timer that was started before decoding began.
 *
 * @param {object} schema - Object whose `toString(row)` renders a decoded row.
 */
function testAvroRead(schema) {
  let rowCount = 0;
  let lastRendered;

  const onMetadata = (metadata) => {
    console.log(metadata);
  };
  const onRow = (row) => {
    lastRendered = schema.toString(row);
    rowCount += 1;
  };
  const onEnd = () => {
    console.log(rowCount);
    console.log(lastRendered);
    console.timeEnd('read');
  };

  console.time('read');
  const decoder = avro.createFileDecoder('test.avro', {
    codecs: { snappy: snappy.uncompress },
  });
  decoder.on('metadata', onMetadata).on('data', onRow).on('end', onEnd);
}

Conclusion

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Mikhail Levkovsky

Code. Ship. Repeat. Build great things with great people. cofounder @configtree