fix: correct yolo cls model preprocess

Joker1212 · Joker1212 · commit a5167be2c0dd · 2024-10-29T13:37:19.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,5 @@ long1.jpg
 .DS_Store
 *.npy
 outputs/
+/tests/test_files/standard_dataset/
+/lineless_table_rec/images/
diff --git a/README.md b/README.md
@@ -15,10 +15,11 @@
 ### 最近更新
 - **2024.10.13**
   - 补充最新paddlex-SLANet-plus 测评结果(已集成模型到[RapidTable](https://github.com/RapidAI/RapidTable)仓库)
-- **2024.10.17**
-  - 补充最新surya 表格识别测评结果
 - **2024.10.22**
-  - 补充复杂背景多表格检测提取方案[RapidTableDet](https://github.com/RapidAI/RapidTableDetection)   
+  - 补充复杂背景多表格检测提取方案[RapidTableDet](https://github.com/RapidAI/RapidTableDetection)
+- **2024.10.29**
+  - 使用yolo11重新训练表格分类器，修正wired_table_rec v2逻辑坐标还原错误，并更新测评
+    
 ### 简介
 💖该仓库是用来对文档中表格做结构化识别的推理库，包括来自阿里读光有线和无线表格识别模型，llaipython(微信)贡献的有线表格模型，网易Qanything内置表格分类模型等。
 
@@ -57,10 +58,10 @@
 | [deepdoctection(rag-flow)](https://github.com/deepdoctection/deepdoctection?tab=readme-ov-file) |   0.59975   |       0.69918       |
 | [ppstructure_table_master](https://github.com/PaddlePaddle/PaddleOCR/tree/main/ppstructure)     |   0.61606   |       0.73892       |
 | [ppsturcture_table_engine](https://github.com/PaddlePaddle/PaddleOCR/tree/main/ppstructure)     |   0.67924   |       0.78653       |
-| table_cls + wired_table_rec v1 + lineless_table_rec                                             |   0.68507   |       0.75140       |
 | [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)                     |   0.67310   |       0.81210       |
 | [RapidTable(SLANet)](https://github.com/RapidAI/RapidTable)                                     |   0.71654   |       0.81067       |
-| table_cls + wired_table_rec v2 + lineless_table_rec                                             |   0.73702   |       0.80210       |
+| table_cls + wired_table_rec v1 + lineless_table_rec                                             |   0.75288   |       0.82574       |
+| table_cls + wired_table_rec v2 + lineless_table_rec                                             |   0.77676   |       0.84580       |
 | [RapidTable(SLANet-plus)](https://github.com/RapidAI/RapidTable)                                | **0.84481** |     **0.91369**     |
 
 ### 使用建议
@@ -87,6 +88,8 @@ from wired_table_rec import WiredTableRecognition
 lineless_engine = LinelessTableRecognition()
 wired_engine = WiredTableRecognition()
 table_cls = TableCls()
+# 使用 mode="q" 时分类精度降低，但耗时减少 3/5(0.2s->0.08s)
+# table_cls = TableCls(mode="q")
 img_path = f'images/img14.jpg'
 
 cls,elasp = table_cls(img_path)
@@ -158,7 +161,8 @@ for i, res in enumerate(result):
 - [x] 图片小角度偏移修正方法补充
 - [x] 增加数据集数量，增加更多评测对比
 - [x] 补充复杂场景表格检测和提取，解决旋转和透视导致的低识别率
-- [ ] 优化表格分类器，优化无线表格模型
+- [x] 优化表格分类器
+- [ ] 优化无线表格模型
 
 ### 处理流程
 
diff --git a/table_cls/main.py b/table_cls/main.py
@@ -5,7 +5,7 @@
 import numpy as np
 from PIL import Image
 
-from .utils import InputType, LoadImage, OrtInferSession, ResizePad
+from .utils import InputType, LoadImage, OrtInferSession, resize_and_center_crop
 
 cur_dir = Path(__file__).resolve().parent
 q_cls_model_path = cur_dir / "models" / "table_cls.onnx"
@@ -64,10 +64,15 @@ class YoloCls:
     def __init__(self, model_path):
         self.table_cls = OrtInferSession(model_path)
         self.cls = {0: "wireless", 1: "wired"}
+        self.mean = np.array([0, 0, 0], dtype=np.float32)
+        self.std = np.array([1, 1, 1], dtype=np.float32)
 
     def preprocess(self, img):
-        img, *_ = ResizePad(img, 640)
-        img = np.array(img, dtype=np.float32) / 255.0
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = resize_and_center_crop(img, 640)
+        img = np.array(img, dtype=np.float32) / 255
+        img -= self.mean
+        img /= self.std
         img = img.transpose(2, 0, 1)  # HWC to CHW
         img = np.expand_dims(img, axis=0)  # Add batch dimension, only one image
         return img
diff --git a/table_cls/utils.py b/table_cls/utils.py
@@ -180,17 +180,33 @@ def verify_exist(file_path: Union[str, Path]):
             raise LoadImageError(f"{file_path} does not exist.")
 
 
-def ResizePad(img, target_size):
-    h, w = img.shape[:2]
-    m = max(h, w)
-    ratio = target_size / m
-    new_w, new_h = int(ratio * w), int(ratio * h)
-    img = cv2.resize(img, (new_w, new_h), cv2.INTER_LINEAR)
-    top = (target_size - new_h) // 2
-    bottom = (target_size - new_h) - top
-    left = (target_size - new_w) // 2
-    right = (target_size - new_w) - left
-    img1 = cv2.copyMakeBorder(
-        img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
-    )
-    return img1, new_w, new_h, left, top
+def resize_and_center_crop(image: np.ndarray, target_size: int):
+    """
+    Resize the image so that its shorter side equals ``target_size``, then
+    crop the central ``target_size`` x ``target_size`` square.
+
+    Args:
+        image (np.ndarray): Input image as a NumPy array with shape (height, width, channels).
+        target_size (int): Length of the shorter side after resizing and of the output square.
+
+    Returns:
+        (np.ndarray): Resized and center-cropped image of size target_size x target_size.
+    """
+    # Spatial size of the input (rows, columns).
+    h, w = image.shape[:2]
+
+    # Scale factor that maps the shorter side onto target_size.
+    scale = target_size / min(h, w)
+    # int() truncates, and h * scale can land a hair below target_size through float
+    # rounding; clamp so the crop below is never smaller than target_size.
+    new_h, new_w = max(target_size, int(h * scale)), max(target_size, int(w * scale))
+    resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+
+    # Offsets that center the crop window along each axis.
+    i = (new_h - target_size) // 2
+    j = (new_w - target_size) // 2
+
+    # Crop the centered square.
+    cropped_image = resized_image[i : i + target_size, j : j + target_size]
+
+    return cropped_image