Subtitling videos with AI using Google Chirp

Shaun Keenan
Zencore Engineering
4 min readAug 7, 2023

I have a few thousand video files, each around an hour, that I needed to create subtitles for. Google’s speech-to-text API offers a few different models, one of which is Chirp, a 2B parameter model, which can detect over 100 languages, and does a great job of recognizing music as well. Most of my files have both music and speech in them, so Chirp was the perfect choice.

Skip to the code: https://github.com/skeenan947/google-stt-medium

In order to use the Google STT API, you need to provide audio-only files, so the first order of business is ripping the audio out of all of these video files. I used ffmpeg for this, via the Python ffmpeg library.

video = "some_file.mp4"
# Trim the extension off of the filename. rsplit('.', 1) keeps dots that are
# part of the name itself — split('.')[0] would truncate "my.video.mp4" to "my".
video_name = video.rsplit('.', 1)[0]
# Rip the audio track out of the video with ffmpeg (STT accepts audio only)
stream = ffmpeg.input(video)
mp3 = "{}.mp3".format(video_name)
audio = ffmpeg.output(stream, mp3)
ffmpeg.run(audio)

The STT API requires files to be in a GCS bucket, so here we’ll upload them

# Push the ripped audio file into Cloud Storage so the STT API can read it
bucket_name = "my_bucket"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(mp3)
blob.upload_from_filename(mp3)
print("Uploaded {} to Cloud Storage.".format(mp3))

Now, let’s send a job to the batch recognizer (Chirp). We’re using the “batch” API, which is slower, but significantly cheaper ($0.003/min) than the standard one (up to $0.016/min)

# The GCS path where the batch recognizer will write transcription results.
workspace = "gs://{}/transcripts".format(bucket_name)

# The GCS URI of the audio file to transcribe.
# NOTE: the original passed model="chirp" as a keyword argument to str.format,
# which str.format silently ignores — the model is actually selected inside
# transcribe() via its model parameter.
gcs_uri = "gs://{}/{}".format(bucket_name, mp3)
# Recognizer resource name ("_" selects the default recognizer in the region):
recognizer = "projects/{}/locations/us-central1/recognizers/_".format(project)
print("Transcribing {}...".format(gcs_uri))
# Call our transcribe function (see below for source)
batch_recognize_results = transcribe(workspace, gcs_uri, recognizer)

This code block calls a “transcribe” function, whose source is shown below.

from google.cloud import storage
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
import re
import srt
import datetime

def transcribe(workspace, gcs_uri, recognizer, model="chirp"):
    """Run a batch speech-to-text job and return the parsed results.

    Submits `gcs_uri` to the Speech v2 batch recognizer, waits for the
    operation to finish, then downloads and parses the JSON result file
    that the service wrote under `workspace`.

    Args:
        workspace: GCS URI (gs://bucket/prefix) where the service writes
            its output files.
        gcs_uri: GCS URI of the audio file to transcribe.
        recognizer: full recognizer resource name
            (projects/.../locations/.../recognizers/_).
        model: recognition model to use; defaults to "chirp".

    Returns:
        A cloud_speech.BatchRecognizeResults parsed from the output JSON.
    """
    # Chirp is served from regional endpoints; us-central1 must match the
    # recognizer's location.
    client = SpeechClient(
        client_options=ClientOptions(
            api_endpoint="us-central1-speech.googleapis.com",
        )
    )
    config = cloud_speech.RecognitionConfig(
        # Let the service auto-detect the audio encoding of the file
        auto_decoding_config={},
        language_codes=["en-US"],
        model=model,
        features=cloud_speech.RecognitionFeatures(
            # Per-word timestamps are required later for SRT generation
            enable_word_time_offsets=True,
        ),
    )
    # Direct the batch job to write its result JSON into the workspace bucket
    output_config = cloud_speech.RecognitionOutputConfig(
        gcs_output_config=cloud_speech.GcsOutputConfig(
            uri=workspace),
    )
    files = [cloud_speech.BatchRecognizeFileMetadata(
        uri=gcs_uri
    )]
    request = cloud_speech.BatchRecognizeRequest(
        recognizer=recognizer, config=config, files=files, recognition_output_config=output_config
    )
    # Long-running operation; 1200s timeout covers ~1h audio files
    operation = client.batch_recognize(request=request,timeout=1200)
    result = operation.result(timeout=1200)
    # The operation result maps each input URI to metadata about its output file
    file_results = result.results[gcs_uri]
    # Split the output gs:// URI into bucket and object name
    output_bucket, output_object = re.match(
        r"gs://([^/]+)/(.*)", file_results.uri
    ).group(1, 2)
    # Download the result JSON the service wrote and parse it into proto form
    storage_client = storage.Client()
    bucket = storage_client.bucket(output_bucket)
    blob = bucket.blob(output_object)
    results_bytes = blob.download_as_bytes()
    batch_recognize_results = cloud_speech.BatchRecognizeResults.from_json(
        results_bytes, ignore_unknown_fields=True
    )
    return batch_recognize_results

Now, we have a transcript, but we need to convert it into an SRT (subtitle) file in order to use it with the original videos

# Derive the subtitle filename from the video name, then build the SRT file
subFile = f"{video}.srt"
print(f"Generating SRT file for {video}...")
subs = subtitle_generation(batch_recognize_results, subFile)

and, here is our subtitle_generation function. Part of this code comes from another example I found, which used the Google STT v1 API. It also uses the Python srt library. Credit below.

def subtitle_generation(speech_to_text_response, outfile, bin_size=3):
    """Group recognized words into ~bin_size-second bins and write an SRT file.

    Each bin becomes one subtitle whose start is the first word's start offset
    and whose end is the last grouped word's end offset.

    Args:
        speech_to_text_response: BatchRecognizeResults whose words carry
            start_offset/end_offset (requires enable_word_time_offsets=True).
        outfile: path of the .srt file to write.
        bin_size: target subtitle duration in seconds.

    Returns:
        The list of srt.Subtitle objects that were written to outfile.
    """
    # Credit to: https://github.com/darshan-majithiya/Generate-SRT-File-using-Google-Cloud-s-Speech-to-Text-API/
    transcriptions = []
    index = 0

    for result in speech_to_text_response.results:
        try:
            words = result.alternatives[0].words
            first_offset = words[0].start_offset
            # Bug fix: the original tested only .seconds, so any first word
            # starting within the first second (e.g. at 0.5s) was snapped to 0.
            if first_offset.seconds or first_offset.microseconds:
                start_sec = first_offset.seconds
                start_microsec = first_offset.microseconds
            else:
                # First word of the response starts at time zero
                start_sec = 0
                start_microsec = 0
            end_sec = start_sec + bin_size  # bin end sec

            # End offset of the last word in this result (closes the final bin)
            last_word_end_sec = words[-1].end_offset.seconds
            last_word_end_microsec = words[-1].end_offset.microseconds

            # Running transcript text for the current bin
            transcript = words[0].word
            index += 1  # subtitle index

            for prev, cur in zip(words, words[1:]):
                if cur.end_offset.seconds < end_sec:
                    transcript = transcript + " " + cur.word
                else:
                    # Close the current bin at the previous word's end.
                    # Bug fix: the original omitted make_legal_content here
                    # (it was only applied to the last bin of each result),
                    # so mid-result bins could contain illegal SRT text.
                    transcriptions.append(srt.Subtitle(
                        index,
                        datetime.timedelta(0, start_sec, start_microsec),
                        datetime.timedelta(0, prev.end_offset.seconds,
                                           prev.end_offset.microseconds),
                        srt.make_legal_content(transcript)))

                    # Start a new bin at the current word
                    start_sec = cur.start_offset.seconds
                    start_microsec = cur.start_offset.microseconds
                    end_sec = start_sec + bin_size
                    transcript = cur.word
                    index += 1

            # Flush the final (possibly partial) bin of this result
            transcriptions.append(srt.Subtitle(
                index,
                datetime.timedelta(0, start_sec, start_microsec),
                datetime.timedelta(0, last_word_end_sec, last_word_end_microsec),
                srt.make_legal_content(transcript)))
            index += 1
        except IndexError:
            # Result with no alternatives or no words — nothing to subtitle
            pass

    # Write all subtitles; 'with' guarantees the file is closed even on error
    with open(outfile, "w") as sub_file:
        sub_file.writelines(sub.to_srt() for sub in transcriptions)

    return transcriptions

Lastly, let’s upload the resulting SRT file to GCS

# Upload the finished subtitle file to the same Cloud Storage bucket
client = storage.Client()
target_blob = client.bucket(bucket_name).blob(f"{video_name}.srt")
target_blob.upload_from_filename(subFile)
print(f"Uploaded {subFile} to Cloud Storage.")

You can find all of this in a GitHub repo here. Processing time would be faster if we batched the files into groups rather than submitting one at a time. Watch the repo for updates, I’ll take care of this soon.

--

--