opencv_textblocks.Rmd

---
jupyter:
  jupytext:
    text_representation:
      extension: .Rmd
      format_name: rmarkdown
      format_version: '1.2'
      jupytext_version: 1.11.2
  kernelspec:
    display_name: Python (py38)
    language: python
    name: py38
---

```{python}
try:
    import cv2 as cv
except ImportError:
    # NOTE(review): the commented line below looks like a Jupytext-escaped
    # `!pip` shell magic that becomes live in the notebook -- confirm it still
    # round-trips before editing it.
    # !pip install OpenCV-Python 
    # Retry so that `cv` is bound after an install; if OpenCV is still
    # missing this re-raises ImportError instead of failing later on `cv`.
    import cv2 as cv
```

```{python}
#https://stackoverflow.com/questions/48768604/how-to-find-a-letter-in-an-image-with-python

```

```{python}
def get_contours(img_path):
    """Return the contours found in the image stored at ``img_path``.

    The image is converted to grayscale, binarised with a fixed threshold of
    127 and handed to ``cv.findContours`` using ``RETR_TREE`` and
    ``CHAIN_APPROX_SIMPLE``.

    Args:
        img_path: path of the image file to analyse.

    Returns:
        The contours reported by ``cv.findContours``.

    Raises:
        OSError: if OpenCV cannot read an image from ``img_path``.
    """
    ### load input image and convert it to grayscale
    img = cv.imread(img_path)
    if img is None:
        # cv.imread reports failure by returning None rather than raising.
        raise OSError("could not read image: {!r}".format(img_path))
    imgray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

    #### extract all contours
    _, thresh = cv.threshold(imgray, 127, 255, cv.THRESH_BINARY)
    found = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)
    # OpenCV 4 returns (contours, hierarchy); OpenCV 3 returns
    # (image, contours, hierarchy). The contours are second-to-last in both.
    return found[-2]

```

```{python}
# Screenshot under analysis: keep the colour image and its dimensions around.
img_path = "Screenshot 20210304115007.png"
img = cv.imread(img_path)
(height, width, channels) = img.shape

contours = get_contours(img_path)

# debug: outline every contour in red (BGR) on a copy of the image and save it
img_c = cv.drawContours(
    img.copy(), contours, contourIdx=-1, color=(0, 0, 255), thickness=1
)
cv.imwrite("all_contours.jpg", img_c)
```

```{python}

#### pair each contour with its bounding box (x, y, width, height),
#### dropping the single box that spans the whole image
full_frame = (0, 0, img.shape[1], img.shape[0])
c_list = []
bb_list = []
for contour in contours:
    box = cv.boundingRect(contour)
    if tuple(box) != full_frame:
        bb_list.append(box)
        c_list.append(contour)

# debug: outline every kept box in red on a copy of the image
img_boxes = img.copy()
for (left, top, box_w, box_h) in bb_list:
    cv.rectangle(img_boxes, (left, top), (left + box_w, top + box_h), (0, 0, 255), 1)

```

```{python}
from operator import itemgetter, attrgetter


def _transitions(profile):
    """Return the indices at which `profile` changes value, starting from 0."""
    edges = []
    state = 0
    for idx, value in enumerate(profile):
        if value != state:
            state = value
            edges.append(idx)
    return edges


#### sort bounding boxes by Y, ties broken by X: a stable sort on X followed
#### by a stable sort on Y, so the first item is the top-most (then left-most) box
y_list = sorted(bb_list, key=itemgetter(0))
xy_list = sorted(y_list, key=itemgetter(1))

# occupancy profiles: 1 for every column / row covered by at least one box
x_lines = [0] * width
y_lines = [0] * height

for bb in xy_list:
    x, y, w, h = bb
    for i in range(x, x + w):
        x_lines[i] = 1
    for i in range(y, y + h):
        y_lines[i] = 1

# vertical blue line wherever the column occupancy flips; the line spans the
# image height (it previously stopped at y == width)
for idx in _transitions(x_lines):
    print(idx)
    cv.line(img_boxes, (idx, 0), (idx, height), (255, 0, 0), 1)

# horizontal green line wherever the row occupancy flips
for idx in _transitions(y_lines):
    cv.line(img_boxes, (0, idx), (width, idx), (0, 255, 0), 1)

```

```{python}
# Save the debug image carrying the bounding boxes and the projection lines.
cv.imwrite("boxes.jpg", img_boxes) 
```

```{python}
gutter = 10

# using group rectangles...
# Every box is submitted several times, together with a padded variant.
# Per the OpenCV docs, groupRectangles drops clusters that do not contain more
# than `groupThreshold` similar rectangles, so a box submitted only once
# would be discarded.
rects = bb_list.copy()
for bb in bb_list:
    x, y, w, h = bb
    rects.append((x, y, w, h))
    # NOTE(review): this pads the box to the left/top only (the right/bottom
    # edge stays at x + w / y + h); symmetric padding would need
    # w + 2 * gutter and h + 2 * gutter -- confirm which one is intended.
    rects.append((x - gutter, y - gutter, w + gutter, h + gutter))
print(len(rects))

rects.extend(bb_list)
# cv.groupRectangles returns a (rectangles, weights) pair
cv_rects = cv.groupRectangles(rects, 1)
# number of grouped rectangles (len(cv_rects) itself is always 2)
print(len(cv_rects[0]))

# print image...
img_group = img.copy()

# raw boxes in red; drawn on img_group so they end up in the saved image
for bb in bb_list:
    x, y, w, h = bb
    cv.rectangle(img_group, (x, y), (x + w, y + h), (0, 0, 255), 1)

# grouped boxes in green
for bb in cv_rects[0]:
    (x, y, w, h) = bb
    cv.rectangle(img_group, (x, y), (x + w, y + h), (0, 255, 0), 1)

cv.imwrite("cv.groups.jpg", img_group) 
```

```{python}
class Rectangle(object):
    """Axis-aligned rectangle stored as its two corner points.

    Built from an OpenCV-style ``(x, y, w, h)`` box. The corners are exposed
    as ``x``, ``y`` (top-left) and ``x2``, ``y2`` (bottom-right). Every method
    that takes another rectangle accepts either a ``Rectangle`` or an
    ``(x, y, w, h)`` sequence.
    """

    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.x2 = x + w
        self.y2 = y + h

    @staticmethod
    def _coerce(rect):
        """Return `rect` as a Rectangle, converting an (x, y, w, h) sequence."""
        if isinstance(rect, Rectangle):
            return rect
        x, y, w, h = rect
        return Rectangle(x, y, w, h)

    def intersect(self, rect, gutter=0):
        """Return True if `rect` overlaps this rectangle or lies within
        `gutter` pixels of it; touching edges count as overlapping.

        ``None`` and empty sequences never intersect.
        """
        if rect is None:
            return False
        if not isinstance(rect, Rectangle) and len(rect) == 0:
            return False
        rect = self._coerce(rect)

        # this rectangle ends before `rect` starts (above it or left of it)
        if self.y2 < rect.y - gutter or self.x2 < rect.x - gutter:
            return False

        # this rectangle starts after `rect` ends (below it or right of it);
        # the comparison is against the FAR edge of `rect`
        if self.x - gutter > rect.x2 or self.y - gutter > rect.y2:
            return False

        return True

    # original method name, kept so existing callers continue to work
    check_intersect = intersect

    def union(self, rect, gutter=0):
        """Return a new Rectangle covering both rectangles, grown by `gutter`
        on every side."""
        rect = self._coerce(rect)
        x = min(self.x, rect.x) - gutter
        y = min(self.y, rect.y) - gutter
        x2 = max(self.x2, rect.x2) + gutter
        y2 = max(self.y2, rect.y2) + gutter
        return Rectangle(x, y, x2 - x, y2 - y)

    def intersection(self, rect, gutter=0):
        """Return the overlap between this rectangle (grown by `gutter`) and
        `rect` as a new Rectangle, or None when there is no overlap."""
        rect = self._coerce(rect)
        x = max(self.x - gutter, rect.x)
        y = max(self.y - gutter, rect.y)
        x2 = min(self.x2 + gutter, rect.x2)
        y2 = min(self.y2 + gutter, rect.y2)
        if x2 < x or y2 < y:
            return None
        return Rectangle(x, y, x2 - x, y2 - y)

    def expand(self, rect, gutter=0):
        """Grow this rectangle in place so that it covers `rect`.

        Only the edges that `rect` sticks out of are moved; each moved edge is
        placed `gutter` pixels beyond the corresponding edge of `rect`.
        """
        rect = self._coerce(rect)

        if self.x > rect.x:
            self.x = rect.x - gutter

        if self.y > rect.y:
            self.y = rect.y - gutter

        if self.x2 < rect.x2:
            self.x2 = rect.x2 + gutter

        if self.y2 < rect.y2:
            self.y2 = rect.y2 + gutter

    def __repr__(self):
        return "Rectangle<{}, {}, {}, {}>".format(self.x, self.y, self.x2, self.y2)

    # same text for str() and repr()
    __str__ = __repr__

```

```{python}
def merge_rects(rect_list, gutter=0):
    """Return a list of grouped rectangles.

    Rectangles that overlap, touch, or lie within `gutter` pixels of each
    other are replaced by their common bounding box. This repeats until no
    two remaining rectangles can be merged.

    Idea from
    https://stackoverflow.com/questions/37847923/combine-overlapping-rectangles-python/53894169

    Args:
        rect_list: iterable of ``(x, y, w, h)`` boxes, or None.
        gutter: maximum gap, in pixels, still treated as an overlap.

    Returns:
        A new list of ``(x, y, w, h)`` tuples, ordered by the first rectangle
        of each group; an empty list when `rect_list` is None or empty.
    """
    if rect_list is None:
        return []

    # work in corner form (x1, y1, x2, y2) so unions are simple min / max
    boxes = [(x, y, x + w, y + h) for x, y, w, h in rect_list]

    changed = True
    while changed:
        changed = False
        merged = []
        for x1, y1, x2, y2 in boxes:
            for pos, (mx1, my1, mx2, my2) in enumerate(merged):
                overlaps = (
                    x1 - gutter <= mx2
                    and mx1 - gutter <= x2
                    and y1 - gutter <= my2
                    and my1 - gutter <= y2
                )
                if overlaps:
                    merged[pos] = (
                        min(x1, mx1),
                        min(y1, my1),
                        max(x2, mx2),
                        max(y2, my2),
                    )
                    # a grown box may now reach others: run another pass
                    changed = True
                    break
            else:
                merged.append((x1, y1, x2, y2))
        boxes = merged

    return [(x1, y1, x2 - x1, y2 - y1) for x1, y1, x2, y2 in boxes]
```

```{python}
gutter = 10
contour_areas = []

# First pass: walk the boxes in xy_list order. A box that lies within
# `gutter` of an existing area grows that area (first match wins); any other
# box starts a new area. An empty xy_list simply yields no areas.
for bb in xy_list:
    x, y, w, d = bb
    rect = Rectangle(x, y, w, d)

    for g_rect in contour_areas:
        if g_rect.check_intersect(rect, gutter):
            # expand the existing area to cover this box
            g_rect.expand(rect, gutter)
            break
    else:
        contour_areas.append(rect)

# plot the first-pass areas...
img_group = img.copy()
for bb in contour_areas:
    cv.rectangle(img_group, (bb.x, bb.y), (bb.x2, bb.y2), (128, 128, 255), 1)

print(contour_areas)

# Second pass: merge the areas until no more can be merged. Each area taken
# from the work list absorbs every remaining area it touches; absorbed areas
# are removed, and the scan repeats because a grown area may reach new ones.
merged_areas = []
while contour_areas:
    rect = contour_areas.pop()

    absorbed = True
    while absorbed:
        absorbed = False
        remaining = []
        for g_rect in contour_areas:
            if g_rect.check_intersect(rect, gutter):
                rect.expand(g_rect)
                print(rect, g_rect)
                absorbed = True
            else:
                remaining.append(g_rect)
        contour_areas = remaining

    merged_areas.append(rect)

# plot the merged areas on top of the first-pass areas
for bb in merged_areas:
    cv.rectangle(img_group, (bb.x, bb.y), (bb.x2, bb.y2), (255, 255, 0), 1)

cv.imwrite("groups.jpg", img_group) 
```

```{python}

```

```{python}

```