Creating Interactive AI Programs Using the Gemini API

Astropomeai
4 min read · Dec 16, 2023

With the advent of Gemini, Google’s latest multimodal AI, a new kind of user experience integrating text, voice, and image processing has become possible. In this article, we discuss a program built with the Gemini Pro API (released on December 13, 2023) that communicates with Gemini by sending real-time camera images along with prompts. Our goal was to emulate the sophistication of Google DeepMind’s demos.

Program Overview

This program is an interactive application integrating camera input, image recognition, text generation, and speech synthesis. The user experience progresses through the following steps:

Camera Input Acquisition:

The user activates the camera and captures video frames in real time, allowing the program to visually perceive the user’s surroundings.
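
In isolation, the frame capture is only a few lines of OpenCV; a minimal sketch, assuming a default webcam at index 0:

import cv2

video = cv2.VideoCapture(0)  # open the default camera (index 0 is an assumption)
if not video.isOpened():
    raise IOError("Could not open the camera.")

success, frame = video.read()  # grab one BGR frame as a NumPy array
if success:
    print(frame.shape)  # e.g. (480, 640, 3), depending on the camera
video.release()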

User Text Input:

The user inputs text in response to prompts. This input serves as context for the program to generate responses.

Response Generation Using the Gemini Model:

The captured image and user’s text input are sent to Google’s Gemini model. The model combines the information from the image and text to generate relevant text responses.
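
The core of this step is a single multimodal generate_content call; a minimal sketch, assuming the google-generativeai package, a GOOGLE_API_KEY environment variable, and a placeholder image file sample.jpg:

import os
import PIL.Image
import google.generativeai as genai

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# gemini-pro-vision accepts a list mixing text and PIL images
model = genai.GenerativeModel('gemini-pro-vision')
img = PIL.Image.open('sample.jpg')  # placeholder filename
response = model.generate_content(["Tell me what you see.", img])
print(response.text)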

Speech Synthesis and Playback of Response:

The generated text is converted into speech using the Google Cloud Text-to-Speech API and played back to the user. This provides the user with both visual and auditory feedback.
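
Under the hood this is one synthesize_speech call that returns MP3 bytes; a minimal sketch, assuming the google-cloud-texttospeech package and Google Cloud credentials are configured:

from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()
response = client.synthesize_speech(
    input=texttospeech.SynthesisInput(text="Hello from Gemini."),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
    ),
)
with open("output.mp3", "wb") as out:
    out.write(response.audio_content)  # MP3 bytes, ready for playback (e.g. with pydub)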

Adding and Saving Text on the Image:

The generated text is added to the image, helping the user intuitively understand the relationship between the image and the text. The processed image is saved locally for future reference.
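
The overlay itself relies on cv2.putText, drawing a thicker black pass first so the white text stays readable on any background; a minimal sketch using a placeholder frame and caption:

import cv2
import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder frame; normally a camera capture
caption = "It's a simple drawing of a duck in water."

font = cv2.FONT_HERSHEY_SIMPLEX
position = (10, 30)
cv2.putText(frame, caption, position, font, 1.0, (0, 0, 0), 4, cv2.LINE_AA)        # black outline
cv2.putText(frame, caption, position, font, 1.0, (255, 255, 255), 2, cv2.LINE_AA)  # white text
cv2.imwrite("annotated.jpg", frame)  # placeholder output path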

User Experience Flow

  1. Start the program and capture the surroundings through the camera.
  2. Enter text based on the prompt.
  3. Combine the input and camera image to generate a response using the Gemini model.
  4. Listen to the generated text in speech form.
  5. Save the image containing the generated text for a visual record.

The full Python code is provided below; it includes functions for text-to-speech conversion, text wrapping, adding text to frames, saving frames, and sending frames with text to Gemini.
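
Before running it, the dependencies and credentials need to be in place. A minimal setup note (the pip package names are the usual ones for these libraries; the credential values are placeholders):

# Dependencies (install before running):
#   pip install google-generativeai google-cloud-texttospeech opencv-python pydub pillow
# pydub also needs ffmpeg available on the system to decode and play MP3 files.
#
# Credentials (standard environment variables; values are placeholders):
#   GOOGLE_API_KEY                 - Gemini API key, read by genai.configure()
#   GOOGLE_APPLICATION_CREDENTIALS - path to a Google Cloud service-account JSON,
#                                    used implicitly by TextToSpeechClient()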

import cv2
import os
from collections import deque
from datetime import datetime
from pydub import AudioSegment
from pydub.playback import play
import google.generativeai as genai
from google.cloud import texttospeech
import PIL.Image

def text_to_speech_google(text, client):
    # Set up the text-to-speech request
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    # Send the text-to-speech request
    response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)

    # Save the speech data to a file
    with open("output.mp3", "wb") as out:
        out.write(response.audio_content)

    # Load the MP3 file
    sound = AudioSegment.from_mp3("output.mp3")
    # Play the sound
    play(sound)

def wrap_text(text, line_length):
    """Wrap the text at the specified length."""
    words = text.split(' ')
    lines = []
    current_line = ''

    for word in words:
        # Start a new line when adding the next word would exceed the limit
        if current_line and len(current_line) + len(word) + 1 > line_length:
            lines.append(current_line)
            current_line = word
        else:
            current_line = word if not current_line else current_line + ' ' + word

    lines.append(current_line)  # Add the last line
    return lines

def add_text_to_frame(frame, text):
    # Wrap the text every 70 characters
    wrapped_text = wrap_text(text, 70)

    # Get the height and width of the frame
    height, width = frame.shape[:2]

    # Set the font and size for the text
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1.0  # Increase font size
    color = (255, 255, 255)  # White color
    outline_color = (0, 0, 0)  # Outline color (black)
    thickness = 2
    outline_thickness = 4  # Thickness of the outline
    line_type = cv2.LINE_AA

    # Add each line of text to the image
    for i, line in enumerate(wrapped_text):
        position = (10, 30 + i * 30)  # Adjust the position of each line (larger interval)

        # Draw the outline of the text
        cv2.putText(frame, line, position, font, font_scale, outline_color, outline_thickness, line_type)

        # Draw the text
        cv2.putText(frame, line, position, font, font_scale, color, thickness, line_type)

def save_frame(frame, filename, directory='./frames'):
    # Create the directory if it doesn't exist
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Build the path for the file
    filepath = os.path.join(directory, filename)
    # Save the frame
    cv2.imwrite(filepath, frame)

def save_temp_frame(frame, filename, directory='./temp'):
    # Create the directory if it doesn't exist
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Build the path for the file
    filepath = os.path.join(directory, filename)
    # Save the frame
    cv2.imwrite(filepath, frame)
    return filepath  # Return the path of the saved file

def send_frame_with_text_to_gemini(frame, previous_texts, timestamp, user_input, client):
    # 'client' here is the google.generativeai module, passed in from main()
    temp_file_path = save_temp_frame(frame, "temp.jpg")
    img = PIL.Image.open(temp_file_path)

    # Combine past texts as context
    context = ' '.join(previous_texts)

    # Initialize the Gemini model
    model = client.GenerativeModel('gemini-pro-vision')

    # Send image and text instructions to the model
    prompt = f"Given the context: {context} and the current time: {timestamp}, please respond to the following message without repeating the context. Message: {user_input}"
    response = model.generate_content([prompt, img], stream=True)
    response.resolve()

    # Return the generated text
    return response.text

def main():
    genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
    # Initialize the Google Cloud TTS API client
    client = texttospeech.TextToSpeechClient()

    try:
        video = cv2.VideoCapture(0)
        if not video.isOpened():
            raise IOError("Could not open the camera.")
    except IOError as e:
        print(f"An error occurred: {e}")
        return

    # Queue to hold the text of the last 5 frames
    previous_texts = deque(maxlen=5)

    while True:
        print("Enter a new prompt or press Enter to continue (type 'exit' to end the program):")
        user_input = input().strip()  # Receive input

        # End the program when the user types 'exit'
        if user_input.lower() == 'exit':
            break

        if not user_input:
            user_input = "Tell me what you see."

        success, frame = video.read()
        if not success:
            print("Failed to read the frame.")
            break

        # Get the current timestamp
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Send the frame to Gemini and get the generated text
        generated_text = send_frame_with_text_to_gemini(frame, previous_texts, timestamp, user_input, genai)
        print(f"Timestamp: {timestamp}, Generated Text: {generated_text}")

        # Add the text with timestamp to the queue
        previous_texts.append(f"[{timestamp}] Message: {user_input}, Generated Text: {generated_text}")

        # Add text to the frame (Japanese characters may not display correctly)
        text_to_add = f"{timestamp}: {generated_text}"
        add_text_to_frame(frame, text_to_add)

        # Save the frame (note: ':' in the timestamp is not a valid filename character on Windows)
        filename = f"{timestamp}.jpg"
        save_frame(frame, filename)

        # Convert text to speech
        text_to_speech_google(generated_text, client)

    # Release the video
    video.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

Execution Results

As an experiment, we showed the program a hand-drawn picture of a duck, similar to a Google demo.

Prompt: Tell me what you see.

Response: It’s a simple drawing of a duck in water.
