How to Use Data Pipelines with Python

In this article, I'll explain what a data pipeline is, how to use data pipelines with Python, and walk through examples of creating pipelines for text, image, and NumPy array datasets using TensorFlow.


What Is a Data Pipeline?

A data pipeline is a series of steps that takes raw data from its source, transforms it (loading, shuffling, batching, preprocessing), and delivers it to a model in a form it can consume. In TensorFlow, the tf.data API and the Keras preprocessing utilities let you express these steps as a single, efficient input pipeline.
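To make the idea concrete, here is a minimal sketch of a tf.data pipeline over a small in-memory list; the values and batch size are just placeholders:

import tensorflow as tf

# Build a pipeline: source -> shuffle -> batch -> prefetch
ds = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])
ds = ds.shuffle(buffer_size=6).batch(2).prefetch(tf.data.AUTOTUNE)

# Each iteration yields one shuffled batch of two elements
for batch in ds:
    print(batch.numpy())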

The Data Pipeline for Text Datasets

The text_dataset_from_directory utility expects one sub-directory per class, with the text files inside, like this:

main_directory/
....pos/
........pst1.txt
........pst2.txt
....neg/
........ngt1.txt
........ngt2.txt

Downloading the Dataset

import os
import shutil
import tensorflow as tf

# Download and extract the IMDB movie review dataset
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
data = tf.keras.utils.get_file(
    "aclImdb_v1.tar.gz",
    url,
    untar=True,
    cache_dir='.',
    cache_subdir='')
data_dir = os.path.join(os.path.dirname(data), 'aclImdb')
train_dir = os.path.join(data_dir, 'train')
os.listdir(train_dir)

# The 'unsup' folder doesn't fit the pos/neg layout, so remove it
unused_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(unused_dir)
os.listdir(train_dir)

# In a notebook, you can verify the result with: !ls aclImdb/train

Creating the Data Pipeline

batch_size = 1024
seed = 123

# 80% of the files for training
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

# The remaining 20% for validation (the same seed keeps the split consistent)
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)
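Both calls report how many files and classes they found. To see which folder name maps to which integer label, you can check the class_names attribute that text_dataset_from_directory attaches to the dataset:

# Label 0 is the first entry, label 1 the second
print(train_ds.class_names)  # ['neg', 'pos']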

Exploring the Dataset

import random

# Pick 5 random indices and print their labels and texts
idx = random.sample(range(1, batch_size), 5)
for text_batch, label_batch in train_ds.take(1):
    for i in idx:
        print(label_batch[i].numpy(),
              text_batch.numpy()[i])
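Reading thousands of small files from disk can easily become the training bottleneck, so it's worth finishing the pipeline with caching and prefetching. A small sketch (these settings are a suggestion, not from the original code):

AUTOTUNE = tf.data.AUTOTUNE

# Keep batches in memory after the first epoch and overlap
# input preparation with model execution
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)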

The Data Pipeline for Image Datasets


Exploring the Dataset

import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
traindf = pd.read_csv(
    'flower_photos/all_labels.csv',
    dtype=str)

# Take a look at the first five rows of the dataset
traindf.head()
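The CSV is assumed to contain a file_name and a label column, since those names are passed to flow_from_dataframe below. A quick sanity check on how the images are distributed across classes:

# Count how many images belong to each class
print(traindf['label'].value_counts())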

Creating the Data Pipeline

data_root = 'flower_photos/flowers'
IMAGE_SIZE = (224, 224)
TRAINING_DATA_DIR = str(data_root)
BATCH_SIZE = 32

# Generator arguments: scale pixels to [0, 1] and hold out
# 20% of the rows for validation
datagen_kwargs = dict(
    rescale=1./255,
    validation_split=.20)

# Flow arguments: resize every image to 224x224
dataflow_kwargs = dict(
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    interpolation="bilinear")

train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    **datagen_kwargs)
train_generator = train_datagen.flow_from_dataframe(
    dataframe=traindf,
    directory=data_root,
    x_col="file_name",
    y_col="label",
    subset="training",
    seed=10,
    shuffle=True,
    class_mode="categorical",
    **dataflow_kwargs)
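The training step further down references a valid_generator that the snippets above never define. Here is a sketch that mirrors train_generator, drawing on the held-out 20% of the same split:

# Validation flow: same dataframe, but the validation subset
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    **datagen_kwargs)
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=traindf,
    directory=data_root,
    x_col="file_name",
    y_col="label",
    subset="validation",
    seed=10,
    shuffle=False,
    class_mode="categorical",
    **dataflow_kwargs)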

Inspecting the Dataset

# Map label indices back to class names
idx_labels = {v: k for k, v in train_generator.class_indices.items()}

image_batch, label_batch = next(iter(train_generator))
fig, axes = plt.subplots(8, 4, figsize=(20, 40))
axes = axes.flatten()
for img, lbl, ax in zip(image_batch, label_batch, axes):
    ax.imshow(img)
    label_ = np.argmax(lbl)      # one-hot vector -> index
    label = idx_labels[label_]   # index -> class name
    ax.set_title(label)
    ax.axis('off')
plt.show()

Training the Model

model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(
        input_shape=IMAGE_SIZE + (3,)),
    # Pre-trained ResNet-50 feature extractor from TensorFlow Hub
    hub.KerasLayer(
        "https://tfhub.dev/tensorflow/resnet_50/feature_vector/1",
        trainable=False),
    tf.keras.layers.Dense(
        5, activation='softmax',
        name='custom_class')])
model.build([None, 224, 224, 3])

model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.005, momentum=0.9),
    # The last layer already applies softmax, so from_logits must be False
    loss=tf.keras.losses.CategoricalCrossentropy(
        from_logits=False, label_smoothing=0.1),
    metrics=['accuracy'])

steps_per_epoch = train_generator.samples // train_generator.batch_size
validation_steps = valid_generator.samples // valid_generator.batch_size

model.fit(
    train_generator,
    epochs=13,
    steps_per_epoch=steps_per_epoch,
    validation_data=valid_generator,
    validation_steps=validation_steps)

Data Pipeline for NumPy Array Datasets


Loading the Dataset

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Load Fashion-MNIST as NumPy arrays
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
print(type(train_images), type(train_labels))
print(train_images.shape, train_labels.shape)
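The labels are integers from 0 to 9. Fashion-MNIST's ten classes are fixed, so the mapping can be written out directly:

# Standard Fashion-MNIST class names, indexed by label
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
print(class_names[train_labels[5]])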

Exploring the Dataset

plt.figure()
plt.imshow(train_images[5])
plt.colorbar()
plt.grid(False)
plt.show()

Preprocessing the Datasets

# Scale pixel values to [0, 1]
train_images = train_images / 255.0
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_images, train_labels))

SHUFFLE_BUFFER_SIZE = 10000
VALIDATION_SAMPLE_SIZE = 10000
TRAIN_BATCH_SIZE = 50
VALIDATION_BATCH_SIZE = 10000

# Creating the data pipelines: shuffle once (with a fixed order
# across iterations so the two splits don't overlap), then take
# 10,000 examples for validation and train on the rest
train_dataset = train_dataset.shuffle(
    SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=False)
validation_ds = train_dataset.take(
    VALIDATION_SAMPLE_SIZE).batch(VALIDATION_BATCH_SIZE)
train_ds = train_dataset.skip(
    VALIDATION_SAMPLE_SIZE).batch(TRAIN_BATCH_SIZE).repeat()
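To confirm the pipeline produces what the model expects, you can inspect its element specification and the shape of one batch:

# Each element is a (images, labels) pair of batched tensors
print(train_ds.element_spec)
for images, labels in train_ds.take(1):
    print(images.shape, labels.shape)  # (50, 28, 28) (50,)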

Building the Model

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(10)])

# Compiling the model
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True),
    metrics=['sparse_categorical_accuracy'])

# Since train_ds repeats forever, tell fit how many steps make one epoch
steps_per_epoch = (len(train_images) - VALIDATION_SAMPLE_SIZE) // TRAIN_BATCH_SIZE
validation_steps = VALIDATION_SAMPLE_SIZE // VALIDATION_BATCH_SIZE

# Training the model
model.fit(
    train_ds,
    epochs=13,
    steps_per_epoch=steps_per_epoch,
    validation_data=validation_ds,
    validation_steps=validation_steps)
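As a final check, the test split loaded earlier can go through the same kind of pipeline to evaluate the trained model; a short sketch:

# Apply the same scaling to the test images, then evaluate
test_images = test_images / 255.0
test_ds = tf.data.Dataset.from_tensor_slices(
    (test_images, test_labels)).batch(TRAIN_BATCH_SIZE)
model.evaluate(test_ds)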

Summary

In this article, I explained what a data pipeline is and walked through building one with TensorFlow for three kinds of data: text files loaded with text_dataset_from_directory, images fed through ImageDataGenerator and flow_from_dataframe, and NumPy arrays wrapped with tf.data.Dataset.from_tensor_slices.
