Subtitling videos with AI using Google Chirp

Shaun Keenan
Zencore Engineering
4 min read · Aug 7, 2023

I have a few thousand video files, each around an hour, that I needed to create subtitles for. Google’s speech-to-text API offers a few different models, one of which is Chirp, a 2B parameter model, which can detect over 100 languages, and does a great job of recognizing music as well. Most of my files have both music and speech in them, so Chirp was the perfect choice.

In order to use Google STT API, you need to provide audio-only files to the API, so the first order of business is ripping the audio out of all of these video files. I used ffmpeg for this, with the python ffmpeg library.

video = "some_file.mp4"
# Strip only the final extension, so a name such as "my.talk.mp4" keeps its
# full stem ("my.talk") instead of being cut at the first dot.
video_name = video.rsplit('.', 1)[0]
# Describe the audio extraction: read the video, write an MP3 named after it.
stream = ffmpeg.input(video)
mp3 = "{}.mp3".format(video_name)
audio = ffmpeg.output(stream, mp3)
# ffmpeg.output() only builds the pipeline description; run() actually invokes
# ffmpeg and produces the MP3 on disk.
ffmpeg.run(audio)

The STT API requires files to be in a GCS bucket, so here we’ll upload them:

# Set up the storage client
storage_client = storage.Client()
bucket_name = "my_bucket"
bucket = storage_client.bucket(bucket_name)
# Upload the audio to Cloud Storage; the object name mirrors the local file name.
blob = bucket.blob(mp3)
# bucket.blob() only creates a local handle; this call performs the transfer.
blob.upload_from_filename(mp3)
print("Uploaded {} to Cloud Storage.".format(mp3))

Now, let’s send a job to the batch recognizer (Chirp). We’re using the “batch” API, which is slower, but significantly cheaper ($0.003/min) than the standard one (up to $0.016/min)

# The output path of the transcription result.
workspace = "gs://{}/transcripts".format(bucket_name)

# The name of the audio file to transcribe:
gcs_uri = "gs://{}/{}".format(bucket_name, mp3)
# Recognizer resource name:
# NOTE(review): `project` is not defined anywhere in this excerpt; presumably
# it holds the Google Cloud project ID. Confirm before running.
recognizer = "projects/{}/locations/us-central1/recognizers/_".format(project)
print("Transcribing {}...".format(gcs_uri))
# Call our transcribe function (see below for source). The model is named
# here, on the call that uses it, rather than being passed to str.format().
batch_recognize_results = transcribe(workspace, gcs_uri, recognizer, model="chirp")

This code block calls a “transcribe” function, which can be found below

import datetime
import re

import srt
from google.api_core.client_options import ClientOptions
from google.cloud import storage
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

def transcribe(workspace, gcs_uri, recognizer, model="chirp", language_codes=None):
    """Run a Speech-to-Text v2 batch recognition job and return its parsed results.

    The job reads the audio at ``gcs_uri`` and writes its transcript under
    ``workspace`` in Cloud Storage. This function waits for the job, downloads
    the transcript object it reports, and parses it.

    NOTE(review): the constructor arguments below were missing from the
    published listing and are reconstructed from Google's batch-recognize
    sample. Confirm them against the original repository.

    Args:
        workspace: ``gs://`` URI prefix where the API writes the transcript.
        gcs_uri: ``gs://`` URI of the audio file to transcribe.
        recognizer: Recognizer resource name of the form
            ``projects/<project>/locations/<location>/recognizers/<id>``.
        model: Name of the recognition model to use.
        language_codes: Optional iterable of language codes for the
            recognizer. Defaults to ``["en-US"]``.

    Returns:
        The ``cloud_speech.BatchRecognizeResults`` parsed from the transcript.

    Raises:
        ValueError: If the result URI reported for ``gcs_uri`` is not a
            ``gs://bucket/object`` URI.
    """
    # A recognizer in a regional location has to be called through that
    # region's endpoint, so derive the endpoint from the resource name.
    location = re.search(r"/locations/([^/]+)/", recognizer)
    if location and location.group(1) != "global":
        api_endpoint = "{}-speech.googleapis.com".format(location.group(1))
    else:
        api_endpoint = "speech.googleapis.com"
    client = SpeechClient(client_options=ClientOptions(api_endpoint=api_endpoint))

    config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        # Word-level offsets are what the subtitle timing is built from.
        features=cloud_speech.RecognitionFeatures(enable_word_time_offsets=True),
        model=model,
        language_codes=list(language_codes) if language_codes else ["en-US"],
    )
    output_config = cloud_speech.RecognitionOutputConfig(
        gcs_output_config=cloud_speech.GcsOutputConfig(uri=workspace),
    )
    files = [cloud_speech.BatchRecognizeFileMetadata(uri=gcs_uri)]
    request = cloud_speech.BatchRecognizeRequest(
        recognizer=recognizer,
        config=config,
        files=files,
        recognition_output_config=output_config,
    )

    # Batch recognition is a long-running operation; block until it finishes.
    operation = client.batch_recognize(request=request, timeout=1200)
    result = operation.result(timeout=1200)
    file_results = result.results[gcs_uri]

    # Split the reported transcript location into bucket and object name.
    uri_parts = re.match(r"gs://([^/]+)/(.*)", file_results.uri)
    if uri_parts is None:
        raise ValueError(
            "Unexpected transcript location for {}: {!r}".format(gcs_uri, file_results.uri)
        )
    output_bucket, output_object = uri_parts.group(1, 2)

    # Download the transcript object and parse it into a results message.
    storage_client = storage.Client()
    bucket = storage_client.bucket(output_bucket)
    blob = bucket.blob(output_object)
    results_bytes = blob.download_as_bytes()
    batch_recognize_results = cloud_speech.BatchRecognizeResults.from_json(
        results_bytes, ignore_unknown_fields=True
    )
    return batch_recognize_results

Now, we have a transcript, but we need to convert it into an SRT (subtitle) file in order to use it with the original videos

# Name the subtitle file after the video's stem ("some_file.srt") so the local
# file name agrees with the "<video_name>.srt" object name used for the upload.
subFile = "{}.srt".format(video_name)
print("Generating SRT file for {}...".format(video))
subs = subtitle_generation(batch_recognize_results, subFile)

And here is our subtitle_generation function. Part of this code comes from another example I found, which used the Google STT v1 API. It also uses the Python srt library. Credit is given in the code below.

def subtitle_generation(speech_to_text_response, outfile, bin_size=3):
    """Group recognized words into time bins and write them to an SRT file.

    We define a bin of time period to display the words in sync with audio.
    Here, bin_size = 3 means each bin is of 3 secs.
    All the words in the interval of 3 secs in a result are grouped together:
    words are appended to the current subtitle until one ends at or after the
    bin's end second, and that word then starts the next subtitle.

    Args:
        speech_to_text_response: Object whose ``results`` each expose
            ``alternatives[0].words``. Every word must provide ``word``,
            ``start_offset`` and ``end_offset``, and the offsets must have
            ``seconds`` and ``microseconds`` attributes.
        outfile: Path of the SRT file to write.
        bin_size: Length of a bin in seconds.

    Returns:
        The list of ``srt.Subtitle`` objects that was written to ``outfile``.
    """
    transcriptions = []

    # Credit to: (the attribution link is missing from this copy of the article)
    for result in speech_to_text_response.results:
        try:
            words = result.alternatives[0].words
            first_word = words[0]
            last_word = words[-1]
        except IndexError:
            # No alternative, or no timed words, in this result: nothing to show.
            continue

        if first_word.start_offset.seconds:
            # bin start -> for first word of result
            start_sec = first_word.start_offset.seconds
            start_microsec = first_word.start_offset.microseconds
        else:
            # bin start -> the first word begins within second 0, so anchor at 0
            start_sec = 0
            start_microsec = 0
        end_sec = start_sec + bin_size  # bin end sec

        # bin transcript
        transcript = first_word.word

        for position in range(1, len(words)):
            current_word = words[position]
            if current_word.end_offset.seconds < end_sec:
                # Still inside the current bin: keep accumulating.
                transcript = transcript + " " + current_word.word
                continue

            # The bin is full: close it at the end of the previous word.
            previous_word = words[position - 1]
            transcriptions.append(
                srt.Subtitle(
                    # Number subtitles contiguously from 1.
                    len(transcriptions) + 1,
                    datetime.timedelta(0, start_sec, start_microsec),
                    datetime.timedelta(
                        0,
                        previous_word.end_offset.seconds,
                        previous_word.end_offset.microseconds,
                    ),
                    srt.make_legal_content(transcript),
                )
            )

            # reset bin parameters: the current word opens the next bin
            start_sec = current_word.start_offset.seconds
            start_microsec = current_word.start_offset.microseconds
            end_sec = start_sec + bin_size
            transcript = current_word.word

        # append the last (possibly only) bin of this result
        transcriptions.append(
            srt.Subtitle(
                len(transcriptions) + 1,
                datetime.timedelta(0, start_sec, start_microsec),
                datetime.timedelta(
                    0,
                    last_word.end_offset.seconds,
                    last_word.end_offset.microseconds,
                ),
                srt.make_legal_content(transcript),
            )
        )

    # turn transcription list into subtitles and write to the SRT file;
    # the context manager guarantees the file is flushed and closed
    with open(outfile, "w", encoding="utf-8") as sub_file:
        sub_file.write(srt.compose(transcriptions))

    return transcriptions

Lastly, let’s upload the resulting SRT file to GCS

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob("{}.srt".format(video_name))
# bucket.blob() only creates a local handle; this call sends the SRT file.
blob.upload_from_filename(subFile)
print("Uploaded {} to Cloud Storage.".format(subFile))

You can find all of this in a GitHub repo here. Processing would be faster if we batched the files into groups rather than submitting them one at a time. Watch the repo for updates; I’ll take care of this soon.

