由于一些行业需要手动录入大量身份证信息，因此编写本软件：通过 OCR 自动识别身份证图片中的姓名、民族、出生日期和身份证号，并将结果连同图片导出到 Excel 表格，以减少人工录入的工作量。
下载地址:
https://pan.xunlei.com/s/VORZO0ORVI_f5-QvI3vutGNgA1?pwd=6jpp#

代码:
- import re
- import threading
- import tkinter as tk
- from tkinter import filedialog, messagebox, ttk
-
- from openpyxl import Workbook
- from openpyxl.drawing.image import Image as XLImage
- from openpyxl.styles import Alignment
- from paddlenlp import Taskflow
- from paddleocr import PaddleOCR
-
# Create the OCR engine and the information-extraction (NLP) model once, at
# import time, so every image processed later reuses the same instances.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang="ch",
    use_gpu=False,
)

# Entity labels the extraction model is asked to find (address, name, time).
schema = ['地址', '姓名', '时间']
ie = Taskflow(
    'information_extraction',
    schema=schema,
    model='uie-base',
)
-
# An 18-character resident ID number: 17 digits followed by a digit or an
# X check character.
_ID_NUMBER_PATTERN = re.compile(r'\d{17}[\dXx]')

# Label printed in front of the ID number on the card.
_ID_LABEL = '公民身份号码'


def _collect_text_lines(result):
    """Return every recognized text string in *result*, in OCR order.

    Pages that are ``None`` or empty (no text detected) are skipped instead
    of raising while iterating them.
    """
    texts = []
    for page in result or []:
        if not page:
            continue
        # Each OCR line is indexed as line[1][0] to obtain its text.
        texts.extend(line[1][0] for line in page)
    return texts


def _find_id_number(texts):
    """Return the first ID number found in *texts*, or "" if there is none."""
    for index, text in enumerate(texts):
        if _ID_LABEL not in text:
            continue
        # OCR may place the number on the label's own line or on the line
        # right after it, so both are checked.
        candidates = [text.partition(_ID_LABEL)[2]]
        candidates.extend(texts[index + 1:index + 2])
        for candidate in candidates:
            match = _ID_NUMBER_PATTERN.search(candidate)
            if match:
                # Keep only the matched characters, not surrounding noise.
                return match.group(0)

    # Fall back to any line containing an ID-shaped token, in case the label
    # was misread or is not adjacent to the number.
    for text in texts:
        match = _ID_NUMBER_PATTERN.search(text)
        if match:
            return match.group(0)
    return ""


def extract_info_from_image(image_path):
    """Run OCR on one ID-card image and extract its structured fields.

    Args:
        image_path: Path of the image file handed to the OCR engine.

    Returns:
        A dict with the keys "姓名" (name), "民族" (ethnicity), "出生日期"
        (birth date), "身份证号" (ID number) and "图片路径" (the input path).
        Fields that could not be recognized are empty strings. Returns
        ``None`` when processing raises; the error is printed.
    """
    try:
        result = ocr.ocr(image_path, cls=True)
        texts = _collect_text_lines(result)

        name = nation = birth = ""

        # Ethnicity is whatever follows the "民族" label on the same line;
        # as before, the last matching line wins.
        for text in texts:
            match = re.search(r'民族(.+)', text)
            if match:
                nation = match.group(1).strip()

        id_number = _find_id_number(texts)

        # The extraction model receives all recognized text, with a space
        # after each fragment.
        all_text = "".join(f"{text} " for text in texts if text)

        # Skip the model call when OCR recognized nothing at all.
        if all_text:
            ie_result = ie(all_text)
            if ie_result and isinstance(ie_result, list):
                info = ie_result[0]
                if info.get('姓名'):
                    name = info['姓名'][0]['text']
                if info.get('时间'):
                    birth = info['时间'][0]['text']

        return {
            "姓名": name,
            "民族": nation,
            "出生日期": birth,
            "身份证号": id_number,
            "图片路径": image_path,
        }

    except Exception as e:
        # Best-effort per image: report the failure and let the caller go on
        # with the remaining files.
        print(f"处理 {image_path} 失败: {e}")
        return None
-
class OCRApp:
    """Tkinter front end that OCRs selected ID-card images into an Excel file.

    Recognition runs on a worker thread so the window stays responsive. The
    worker never touches widgets: it only records an outcome, which the main
    thread picks up by polling with ``after()``.
    """

    # Excel row heights are expressed in points while image sizes are in
    # pixels; at 96 DPI one pixel is 0.75 point.
    _POINTS_PER_PIXEL = 0.75
    # Delay between checks on the worker thread, in milliseconds.
    _POLL_INTERVAL_MS = 100

    def __init__(self, root):
        """Build the window contents inside *root* (a ``tk.Tk`` instance)."""
        self.root = root
        self.root.title("身份证批量识别工具")
        self.root.geometry("400x200")

        self.label = tk.Label(root, text="点击下方按钮选择身份证图片", font=("Arial", 14))
        self.label.pack(pady=20)

        self.btn = tk.Button(root, text="选择图片", command=self.start_batch_ocr)
        self.btn.pack(pady=10)

        self.progress = ttk.Progressbar(root, orient="horizontal", length=300, mode="indeterminate")
        self.progress.pack(pady=10)

        # Worker thread currently running, if any.
        self._worker = None
        # (dialog function, title, message) left by the worker for the main thread.
        self._outcome = None

    def start_batch_ocr(self):
        """Ask for image files and start recognizing them in the background."""
        file_paths = filedialog.askopenfilenames(
            title="选择身份证图片",
            filetypes=[("Image Files", "*.jpg *.jpeg *.png *.bmp")]
        )

        if not file_paths:
            return

        # Show activity and block a second run while this one is in progress.
        self.progress.start()
        self.btn.config(state=tk.DISABLED)

        self._outcome = None
        self._worker = threading.Thread(target=self.process_files, args=(file_paths,))
        self._worker.start()
        self.root.after(self._POLL_INTERVAL_MS, self._poll_worker)

    def _poll_worker(self):
        """Main-thread callback: restore the UI once the worker has finished."""
        if self._worker is not None and self._worker.is_alive():
            self.root.after(self._POLL_INTERVAL_MS, self._poll_worker)
            return

        self.progress.stop()
        self.btn.config(state=tk.NORMAL)

        outcome, self._outcome = self._outcome, None
        if outcome is not None:
            show_dialog, title, message = outcome
            show_dialog(title, message)

    def process_files(self, file_paths):
        """Worker-thread entry point: recognize *file_paths* and save the workbook.

        No widget is touched here. The result, or the error, is stored for
        ``_poll_worker`` to display from the main thread, so a failure can no
        longer leave the progress bar running and the button disabled.
        """
        try:
            output_path, failed = self._write_workbook(file_paths)
        except Exception as e:
            # Thread boundary: report any failure (e.g. the output file being
            # locked by another program) instead of dying silently.
            print(f"批量识别失败: {e}")
            self._outcome = (messagebox.showerror, "错误", f"处理失败: {e}")
            return

        message = f"识别完成,结果已保存至 {output_path}"
        if failed:
            message += f"\n{len(failed)} 张图片识别失败,已跳过"
        self._outcome = (messagebox.showinfo, "完成", message)

    def _write_workbook(self, file_paths):
        """Recognize each image, write one row per success, and save the file.

        Returns:
            A tuple ``(output_path, failed_paths)`` where *failed_paths* lists
            the images for which recognition returned nothing.
        """
        wb = Workbook()
        ws = wb.active
        ws.append(["图片", "姓名", "民族", "出生日期", "身份证号"])

        centered = Alignment(horizontal='center', vertical='center')
        failed = []
        row_idx = 2  # Row 1 holds the header.

        for path in file_paths:
            info = extract_info_from_image(path)
            if not info:
                failed.append(path)
                continue

            ws.cell(row=row_idx, column=2, value=info["姓名"])
            ws.cell(row=row_idx, column=3, value=info["民族"])
            ws.cell(row=row_idx, column=4, value=info["出生日期"])
            ws.cell(row=row_idx, column=5, value=info["身份证号"])

            try:
                img = XLImage(path)
                img.width = 500
                img.height = 300
                # Convert the pixel height to points so the row fits the image.
                ws.row_dimensions[row_idx].height = img.height * self._POINTS_PER_PIXEL
                ws.add_image(img, f"A{row_idx}")

                # Rough pixel-to-column-width conversion for column A.
                ws.column_dimensions['A'].width = img.width * 0.14
            except Exception as e:
                # The text fields are still useful without the picture.
                print(f"无法插入图片 {path}: {e}")

            # Center columns A-E of this row.
            for col in range(1, 6):
                ws.cell(row=row_idx, column=col).alignment = centered

            row_idx += 1

        # Center the header row as well.
        for col in range(1, 6):
            ws.cell(row=1, column=col).alignment = centered

        output_path = "身份证识别结果.xlsx"
        wb.save(output_path)
        return output_path, failed
-
if __name__ == "__main__":
    # Create the main window, attach the application to it, and hand control
    # to the Tk event loop until the window is closed.
    main_window = tk.Tk()
    application = OCRApp(main_window)
    main_window.mainloop()