如何增加 AI 虚拟聊天伴侣的趣味性——OpenCV 识别大量真实聊天图片,采集高质量语料

发布时间:2023年12月18日

图像转文字

代码一:(文件run_batch.py)

批量读取多个目录下的所有多页 PDF 文件和 JPG 图片,通过 OCR 识别图片中的文字,并将结果分别保存到多个 JSON 文件中

import os
import json
import base64, re
from tqdm import tqdm
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
import fitz  # PyMuPDF
import numpy as np

# Tencent Cloud OCR service credentials (activating the service grants 1000
# free calls). Read from the environment so real keys never have to be
# committed to source; both fall back to the empty placeholder when unset.
secret_id = os.environ.get("TENCENTCLOUD_SECRET_ID", "")
secret_key = os.environ.get("TENCENTCLOUD_SECRET_KEY", "")

def get_imges(pdf_path):
    """Yield the content of a PDF or image file as base64-encoded strings.

    A path ending in ``pdf`` (compared case-insensitively) is opened with
    PyMuPDF; every page is rendered to PNG and yielded in page order. Any
    other path is read as one binary file and yielded as a single string.

    Args:
        pdf_path: Path to a PDF document or to an image file.

    Yields:
        str: Base64 text of one PNG-rendered page, or of the whole file.

    Returns:
        For PDFs the page count becomes the generator's return value
        (``StopIteration.value``); for other files it is ``None``.
    """
    if not pdf_path.lower().endswith('pdf'):
        with open(pdf_path, 'rb') as i_file:
            yield base64.b64encode(i_file.read()).decode()
        return

    pdf_document = fitz.open(pdf_path)
    try:
        page_count = pdf_document.page_count
        for page_num in range(page_count):
            page = pdf_document[page_num]
            # PNG is lossless, so no JPEG quality setting applies here.
            png_bytes = page.get_pixmap().tobytes('png')
            yield base64.b64encode(png_bytes).decode()
    finally:
        # Runs even when the consumer stops iterating early or a page fails
        # to render, so the document handle is never leaked.
        pdf_document.close()
    print("page_count", page_count)
    return page_count
        


def _new_ocr_client():
    """Build a Tencent Cloud OCR client from the module-level credentials."""
    cred = credential.Credential(secret_id, secret_key)
    http_profile = HttpProfile()
    http_profile.endpoint = "ocr.tencentcloudapi.com"
    client_profile = ClientProfile()
    client_profile.httpProfile = http_profile
    return ocr_client.OcrClient(cred, "ap-guangzhou", client_profile)


def make_api_call(jpg_fpath, jsn_fpath, json_data):
    """OCR every page/image of one file and save the results as JSON.

    Each base64 string produced by ``get_imges(jpg_fpath)`` is sent to the
    GeneralBasicOCR endpoint. The ``TextDetections`` list of each response
    is appended to ``json_data['TextDetections']`` (created if missing), so
    the stored value is a list with one entry per successfully read page.
    SDK errors are printed and the affected page is skipped.

    Args:
        jpg_fpath: Path of the PDF or image file to recognise.
        jsn_fpath: Path of the JSON file to write; parent directories are
            created as needed.
        json_data: Dict that accumulates results; mutated in place. Nothing
            is written when it is empty after processing.
    """
    client = None
    for base64_str in get_imges(jpg_fpath):
        try:
            # Built lazily inside the try so credential errors are still
            # reported per page, but the client is only constructed once.
            if client is None:
                client = _new_ocr_client()
            # The request type must match the GeneralBasicOCR call below.
            req = models.GeneralBasicOCRRequest()
            params = {
                'LanguageType': 'zh',
                # NOTE(review): get_imges already rasterises PDF pages, so
                # IsPdf/PdfPageNumber look unnecessary, and the data-URI
                # prefix says jpeg although PDF pages are PNG. Kept as-is;
                # confirm against the API documentation.
                'IsPdf': True,
                "PdfPageNumber": 5,
                'ImageBase64': f'data:image/jpeg;base64,{base64_str}',
            }
            req.from_json_string(json.dumps(params))
            resp = client.GeneralBasicOCR(req)
            # A response without detections is treated as an empty page.
            res = json.loads(resp.to_json_string()).get('TextDetections') or []
            print("res lenght:", len(res))
            json_data.setdefault('TextDetections', []).append(res)
        except TencentCloudSDKException as err:
            print(err)
    if json_data:
        print("len(json_data['TextDetections']):", len(json_data.get('TextDetections', [])))
        os.makedirs(os.path.dirname(os.path.realpath(jsn_fpath)), exist_ok=True)
        with open(jsn_fpath, 'w', encoding='UTF-8') as o_file:
            o_file.write(json.dumps(json_data, ensure_ascii=False))

def ocr_api_json(jpg_dpath):

    curr_dir = os.path.dirname(os.path.realpath(__file__))
    image_path = os
文章来源:https://blog.csdn.net/qq_20163065/article/details/135052800
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。