Perfecting Imperfect Table Gridlines: A Step-by-Step Guide using Python OpenCV

Have you ever come across a table with incomplete or missing lines, or worse, with no gridlines at all? If so, this article is specifically tailored for you.

Lidor ES
6 min readFeb 24, 2023

Here I’ll show you how to draw horizontal and vertical gridlines on your table image. If necessary (for example, if some lines are broken or missing), I’ll also show you how to remove all of the existing lines first, so they can be redrawn cleanly using the algorithm presented below!

Table of Contents

  1. Introduction
  2. Goal
  3. Remove Gridlines
  4. Draw Vertical and Horizontal Gridlines
  5. Crop Image According to Table Bounding Boxes
  6. Apply all of the above
  7. Show Image by Stages
  8. End Notes

Introduction

Many times we have images with all sorts of tables, some with borders and grid lines, some with missing or broken parts, and some without gridlines and borders at all.

Here I will try to come up with a generic solution that hopefully will work on all of the above cases.

Throughout this article, whenever I say gridlines I refer to all of the table’s gridlines and its borders.

Goal

My goal here is to remove all of the table gridlines and borders from the image and then redraw them in the proper locations and proper lengths artificially.

Remove Gridlines

def removeLines(old_image: np.ndarray, axis) -> np.ndarray:
    """Paint over the table gridlines of one orientation in white.

    Args:
        old_image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
            The input array itself is not modified.
        axis: ``"horizontal"`` to erase horizontal lines or ``"vertical"``
            to erase vertical lines.

    Returns:
        A copy of ``old_image`` in which every detected line contour has
        been overdrawn in white with a 2-pixel stroke.

    Raises:
        ValueError: If ``axis`` is neither ``"horizontal"`` nor
            ``"vertical"``.
    """
    # cv2.getStructuringElement takes ksize as (width, height): a wide, flat
    # kernel only fits inside horizontal runs, a tall, thin one only inside
    # vertical runs. Validate the axis first so a bad argument fails before
    # any image processing is done.
    if axis == "horizontal":
        kernel_size = (25, 1)
    elif axis == "vertical":
        kernel_size = (1, 25)
    else:
        raise ValueError("Axis must be either 'horizontal' or 'vertical' in order to work")

    gray = cv2.cvtColor(old_image, cv2.COLOR_BGR2GRAY)
    # Inverted Otsu threshold: dark ink (lines, text) becomes white foreground.
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Opening keeps only foreground runs long enough to contain the kernel,
    # which removes text strokes and lines of the other orientation.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # findContours returns 2 values in some OpenCV versions and 3 in others;
    # the contour list is the first of 2 or the second of 3.
    contours = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = contours[0] if len(contours) == 2 else contours[1]

    result = old_image.copy()
    for contour in contours:
        cv2.drawContours(result, [contour], -1, (255, 255, 255), 2)
    return result

Function walkthrough:

  • Converting the input image to grayscale
  • Applying Otsu’s thresholding algorithm, read more about it here
  • Checking if we need a horizontal or a vertical kernel
  • Applying a morphological opening to the thresholded image
  • Finding the contours
  • Drawing complete white lines on top of the detected gridlines contours

Draw Vertical and Horizontal Gridlines

def drawGridlines(old_image: np.ndarray) -> dict:
    """Draw 1-pixel black gridlines through the blank gaps of a table image.

    The image is thresholded on white, then collapsed to a single row (and,
    separately, to a single column). Each white run that survives in the
    collapsed strip is treated as a gap, and a line is drawn through the
    centre of its bounding box across the whole image.

    Args:
        old_image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
            The input array itself is not modified.

    Returns:
        A dict with the keys ``'threshold'`` (the white-threshold image),
        ``'vertical'`` (copy with vertical lines only), ``'horizontal'``
        (copy with horizontal lines only) and ``'full'`` (copy with both).
    """
    def _outer_contours(mask):
        # findContours returns 2 values in some OpenCV versions and 3 in
        # others; the contour list is the first of 2 or the second of 3.
        found = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        return found[0] if len(found) == 2 else found[1]

    height, width = old_image.shape[:2]
    black = (0, 0, 0)

    # Pixels brighter than 220 count as background.
    white_mask = cv2.threshold(
        cv2.cvtColor(old_image, cv2.COLOR_BGR2GRAY), 220, 255, cv2.THRESH_BINARY
    )[1]

    grid_canvas = old_image.copy()
    vertical_canvas = old_image.copy()
    horizontal_canvas = old_image.copy()

    # --- Vertical lines: average every column down to one pixel and keep
    # only the columns whose average is still above 254.
    collapsed_row = cv2.resize(white_mask, (width, 1), interpolation=cv2.INTER_AREA)
    blank_columns = cv2.threshold(collapsed_row, 254, 255, cv2.THRESH_BINARY)[1]
    # Opening with a 5x1 kernel discards white gaps too narrow to hold it.
    blank_columns = cv2.morphologyEx(
        blank_columns,
        cv2.MORPH_OPEN,
        cv2.getStructuringElement(cv2.MORPH_RECT, (5, 1)),
    )
    for gap in _outer_contours(blank_columns):
        left, _, gap_width, _ = cv2.boundingRect(gap)
        mid_x = left + gap_width // 2
        for canvas in (vertical_canvas, grid_canvas):
            cv2.line(canvas, (mid_x, 0), (mid_x, height - 1), black, 1)

    # --- Horizontal lines: same idea with every row averaged to one pixel
    # (no opening is applied in this direction).
    collapsed_column = cv2.resize(white_mask, (1, height), interpolation=cv2.INTER_AREA)
    blank_rows = cv2.threshold(collapsed_column, 254, 255, cv2.THRESH_BINARY)[1]
    for gap in _outer_contours(blank_rows):
        _, top, _, gap_height = cv2.boundingRect(gap)
        mid_y = top + gap_height // 2
        for canvas in (horizontal_canvas, grid_canvas):
            cv2.line(canvas, (0, mid_y), (width - 1, mid_y), black, 1)

    return {
        'threshold': white_mask,
        'vertical': vertical_canvas,
        'horizontal': horizontal_canvas,
        'full': grid_canvas,
    }

The general function walkthrough is (function has comments that explain everything):

  • Thresholding, processing, and getting vertical contours
  • Drawing vertical lines through the centers of those contours
  • Thresholding, processing, and getting horizontal contours
  • Drawing horizontal lines through the centers of those contours

Crop Image According to Table Bounding Boxes

def cropImage(old_image: np.ndarray, padding: int = 0) -> np.ndarray:
    """Crop an image to the box that encloses all of its small white regions.

    The image is thresholded at 128, its outermost 1-pixel frame is replaced
    with white, and the bounding boxes of all contours whose area is below
    half of the image area are merged into one box, which is then cropped.

    Args:
        old_image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
        padding: Number of extra pixels to keep on every side of the merged
            box, clamped to the image bounds. Defaults to 0. Use a small
            positive value to keep lines lying just outside the box.

    Returns:
        The cropped region as a slice of ``old_image``. If no contour is
        smaller than half of the image area, ``old_image`` is returned
        unchanged instead of an empty array.

    Raises:
        ValueError: If ``padding`` is negative.
    """
    if padding < 0:
        raise ValueError("padding must be non-negative")

    # Get dimensions
    hh, ww = old_image.shape[:2]
    # Convert to gray and threshold
    gray = cv2.cvtColor(old_image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)[1]
    # Replace the outermost 1-pixel frame with white so the outer white
    # regions are not counted among the small contours.
    thresh = thresh[1 : hh - 1, 1 : ww - 1]
    thresh = cv2.copyMakeBorder(
        thresh, 1, 1, 1, 1, borderType=cv2.BORDER_CONSTANT, value=(255, 255, 255)
    )

    # findContours returns 2 values in some OpenCV versions and 3 in others;
    # the contour list is the first of 2 or the second of 3.
    contours = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = contours[0] if len(contours) == 2 else contours[1]

    # Keep only the bounding boxes of contours SMALLER than half of the image.
    thresh_area = hh * ww / 2
    boxes = [
        cv2.boundingRect(contour)
        for contour in contours
        if cv2.contourArea(contour) < thresh_area
    ]
    if not boxes:
        # Nothing to crop to; the original code produced an empty array here.
        return old_image

    # Merge the boxes; xmax / ymax are INCLUSIVE pixel coordinates.
    xmin = min(x for x, _, _, _ in boxes)
    ymin = min(y for _, y, _, _ in boxes)
    xmax = max(x + w - 1 for x, _, w, _ in boxes)
    ymax = max(y + h - 1 for _, y, _, h in boxes)

    # Debug overlay of the merged box, drawn on a private copy.
    # NOTE(review): this copy is never returned or stored; it is only useful
    # when inspected in a debugger.
    bounding_box = old_image.copy()
    cv2.rectangle(bounding_box, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)

    # Apply the optional padding without leaving the image.
    x_start = max(xmin - padding, 0)
    y_start = max(ymin - padding, 0)
    x_stop = min(xmax + padding, ww - 1)
    y_stop = min(ymax + padding, hh - 1)

    # Slice ends are exclusive, so add 1 to include the last row and column.
    result = old_image[y_start : y_stop + 1, x_start : x_stop + 1]
    return result

Function walkthrough (read function comments to know more):

  • Converting the input image to grayscale
  • Applying a binary threshold
  • Replacing the outermost 1-pixel frame with white, so the outer white regions are not counted among the small contours
  • Getting the contours
  • Iterate over all the contours and compute the minimum and maximum values of the x and y coordinates of their bounding boxes
  • Draw a rectangle to represent the bounding box around the region of interest
  • Crop image according to the bounding box

Apply all of the above

Here I will show you an example of a main file, using all of the above functions and showing the result

import cv2
import matplotlib.pyplot as plt
from GridlinesImprovement.draw_gridlines_functions import drawGridlines
from GridlinesImprovement.cropping import cropImage
from GridlinesImprovement.remove_gridlines import removeLines


image_path = "Model Implementation/DummyDatabase/test_images/bad_grid.png"
new_image_path = "Model Implementation/DummyDatabase/test_images/image_grided.png"

# Read image. cv2.imread does not raise on a missing or unreadable file, it
# returns None, so fail here with a clear message instead of deep inside
# the first OpenCV call.
original = cv2.imread(image_path)
if original is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Remove all gridlines (both orientations)
gridless = removeLines(removeLines(original, 'horizontal'), 'vertical')

# Draw grid lines.
# images_by_stage: (dict)
#   'threshold':  threshold image
#   'vertical':   vertical grid lines image
#   'horizontal': horizontal grid lines image
#   'full':       full grid lines image
images_by_stage = drawGridlines(gridless)

# Obtain full grid image (copied so the crop does not alias the dict entry)
full_image = images_by_stage['full'].copy()

# Crop image
cropped_image = cropImage(full_image)

# Save new image. cv2.imwrite reports failure through its return value.
if not cv2.imwrite(new_image_path, cropped_image):
    raise OSError(f"Could not write image: {new_image_path}")

Show Image by Stages

# Show every stage produced by the script above. It defines `original`,
# `images_by_stage` and `cropped_image`.
# OpenCV stores colour images as BGR while matplotlib expects RGB, so each
# image is converted before being displayed.
f1, ax1 = plt.subplots(2, 2, figsize=(15, 5))
ax1[0][0].set_title('Input Image')
ax1[0][0].imshow(cv2.cvtColor(original, cv2.COLOR_BGR2RGB))
ax1[0][1].set_title('images["vertical"]')
ax1[0][1].imshow(cv2.cvtColor(images_by_stage["vertical"], cv2.COLOR_BGR2RGB))
ax1[1][0].set_title('images["horizontal"]')
ax1[1][0].imshow(cv2.cvtColor(images_by_stage["horizontal"], cv2.COLOR_BGR2RGB))
ax1[1][1].set_title('images["full"]')
ax1[1][1].imshow(cv2.cvtColor(images_by_stage["full"], cv2.COLOR_BGR2RGB))
f2, ax2 = plt.subplots(1, 1, figsize=(8, 5))
ax2.set_title('Final Image')
ax2.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
plt.show()
Image by Stages

End Notes

This article grew out of an improvement idea I had when I needed to extract tabular data and ran into a problem: whenever a table had broken gridlines, or no gridlines at all, the extraction algorithm didn't work well. So I came up with the idea of artificially drawing the table gridlines.

The order of the 3 articles that are related to each other is as follows:

  1. Table Detection and Extraction — TableNet, Deep Learning model with PyTorch from images
  2. This article
  3. Image Table to DataFrame using Python OCR

Full code can be found in my GitHub repo

--

--

Lidor ES

Data Scientist & Engineer and Software Engineering student