import math
import os

import cv2
import numpy as np
import pandas as pd
import streamlit as st
from pdf2image import convert_from_bytes


def get_img(uploaded_file):
    """Decode an uploaded file-like object into an OpenCV colour image.

    Args:
        uploaded_file: Object exposing ``read()`` that yields the encoded
            image bytes (presumably a Streamlit upload -- confirm at caller).

    Returns:
        Whatever ``cv2.imdecode`` produces for the bytes in colour mode.
    """
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    # IMREAD_COLOR has the value 1, which the original passed literally.
    return cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)


def convert_pdf_to_image(filename):
    """Render a PDF into one image per page.

    Args:
        filename: Despite the name, this is handed straight to
            ``convert_from_bytes``, so it should be the raw PDF bytes.

    Returns:
        The list of page images returned by ``convert_from_bytes``; the
        second positional argument (500) is passed as the render DPI.
    """
    return convert_from_bytes(filename, 500)


def filter_color(img):
    """Build an inverted mask of the dark pixels of a BGR image.

    Pixels whose HSV value lies inside ``[0, 0, 0]..[179, 100, 130]`` (the
    range the original author labelled "black") are selected by
    ``cv2.inRange`` and the resulting mask is then inverted.

    Args:
        img: Image accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2HSV)``.

    Returns:
        The bitwise-inverted ``inRange`` mask.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Range treated as "black" in HSV space.
    lower_val = np.array([0, 0, 0])
    upper_val = np.array([179, 100, 130])

    # Select the pixels inside the range ...
    mask = cv2.inRange(hsv, lower_val, upper_val)
    # ... then invert the mask (this is a NOT, not an AND with the image).
    return cv2.bitwise_not(mask)


def plot(img, boxes):
    """Draw red boxes and their labels on a copy of ``img``.

    Args:
        img: Image array; only the first two entries of ``img.shape``
            (height, width) are used, so 2-D images are accepted as well.
        boxes: Iterable of sequences whose first four items are the
            left, top, right, bottom coordinates and whose fifth item is a
            number rendered (rounded to 2 decimals) next to the box --
            presumably a detection score; confirm against the caller.

    Returns:
        The annotated copy; ``img`` itself is left untouched.
    """
    FONT_SCALE = 1e-3
    THICKNESS_SCALE = 1e-3
    TEXT_Y_OFFSET_SCALE = 2.5e-2

    # Slicing instead of 3-way unpacking keeps grayscale input working.
    height, width = img.shape[:2]
    shortest_side = min(width, height)
    font_scale = shortest_side * FONT_SCALE
    thickness = math.ceil(shortest_side * THICKNESS_SCALE)
    text_y_offset = int(height * TEXT_Y_OFFSET_SCALE)
    red = (0, 0, 255)  # BGR

    tmp = img.copy()
    for box in boxes:
        left, top = int(box[0]), int(box[1])
        right, bottom = int(box[2]), int(box[3])
        tmp = cv2.rectangle(tmp, (left, top), (right, bottom), red, thickness)
        text = str(round(float(box[4]), 2))
        cv2.putText(
            tmp,
            text,
            (left, top + text_y_offset),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            red,
            thickness,
        )
    return tmp


def delete_file(filename):
    """Remove ``filename`` if it exists; do nothing otherwise."""
    if os.path.exists(filename):
        os.remove(filename)


def save_excel_file(
    idx, df: pd.DataFrame, foldername, filename, page_enumeration: int = 0
):
    """Write one extracted table to ``foldername`` as a CSV file.

    Note that, despite the function name, the output is CSV. The file is
    named ``{filename}page{page_enumeration}table{idx}.csv``; ``concat_csv``
    relies on that pattern to order the tables.

    Args:
        idx: Table index within the page.
        df: Table to write (written without its index).
        foldername: Destination directory.
        filename: Prefix of the generated file name.
        page_enumeration: Page number the table came from.
    """
    df.to_csv(
        f"{foldername}/{filename}page{page_enumeration}table{idx}.csv",
        index=False,
    )


def _page_table_key(name):
    """Return a ``(page, table)`` sort key for a table CSV name, else None.

    The name is parsed from the right so that a prefix which itself contains
    "page" or "table" cannot confuse the split. Decimal tokens sort
    numerically (page 2 before page 10); anything else sorts after them,
    alphabetically.
    """
    stem, ext = os.path.splitext(name)
    if ext.lower() != ".csv":
        return None
    head, table_sep, table = stem.rpartition("table")
    _, page_sep, page = head.rpartition("page")
    if not (table_sep and page_sep):
        return None

    def natural(token):
        if token.isdecimal():
            return (0, int(token), "")
        return (1, 0, token)

    return natural(page), natural(table)


def concat_csv(folder, filename: str):
    """Merge the per-table CSV files of ``folder`` into a single CSV.

    Files named like ``...page{P}table{T}.csv`` are read in (page, table)
    order. The first row of every table is treated as a header row: it is
    dropped from the data, and the first row of the first non-empty table
    becomes the column names of the merged frame. The result is shown with
    ``st.dataframe`` and written to ``filename`` without its index.

    Directory entries that do not match the naming pattern, and CSV files
    with no content at all, are skipped instead of aborting the merge.

    Args:
        folder: Object whose ``name`` attribute is the directory path
            (presumably a ``tempfile.TemporaryDirectory`` -- confirm).
        filename: Path of the merged CSV to write.

    Raises:
        ValueError: If the merged frame ends up with a different number of
            columns than the header row (tables of differing widths).
    """
    foldername = folder.name

    keyed_names = []
    for name in os.listdir(foldername):
        key = _page_table_key(name)
        if key is not None:
            keyed_names.append((key, name))
    keyed_names.sort()

    columns = None
    bodies = []
    for _, name in keyed_names:
        try:
            table = pd.read_csv(os.path.join(foldername, name))
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing to merge.
            continue
        if columns is None and not table.empty:
            columns = table.iloc[0]
        bodies.append(table[1:])

    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(bodies) if bodies else pd.DataFrame()
    if not df.empty:
        # A non-empty result implies some table had a first row, so
        # ``columns`` is set here.
        df.columns = columns

    st.dataframe(df)
    df.to_csv(filename, index=False)