当前位置: 首页 > article >正文

HF上的 llava-med-zh-instruct-60k 数据预处理代码

  • https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k
import pandas as pd
import cv2
import numpy as np
import json

# Load a single shard; the file name indicates it is the first of 14 parts.
pokeman = pd.read_parquet('data/train-00000-of-00014.parquet')
print(pokeman)

# Process the text.
# Every row of the 'messages' column becomes one record holding the copied
# conversation plus the relative path of the image that belongs to it.
save_data_list = []
for img_name, msg in enumerate(pokeman['messages']):
    # Only the first turn of a conversation carries the "<image>" placeholder;
    # all later turns are copied unchanged.
    save_context_list = [
        {
            "content": ("<image>\n" if turn_index == 0 else "") + turn['content'],
            "role": turn['role'],
        }
        for turn_index, turn in enumerate(msg)
    ]

    save_data_list.append({
        "messages": save_context_list,
        # The row position doubles as the image file name.
        "images": [
            f"llava_image/{img_name}.jpg"
        ],
    })

# ensure_ascii=False keeps non-ASCII text readable in the output file instead
# of escaping it as \uXXXX sequences.
with open("./llava_med_data.json", 'w', encoding="utf-8") as f:
    json.dump(save_data_list, f, ensure_ascii=False, indent=4)

# Process the images (left disabled in this snippet).
# NOTE(review): as written this saves to "image/{index}.jpg", whereas the
# records above reference "llava_image/{n}.jpg". Align the two paths and make
# sure the target directory exists before enabling it.
# for index, img in enumerate(pokeman['images']):
    # image_bytes = img[0]['bytes']
    # # Convert the raw bytes into a numpy array
    # image_np = cv2.imdecode(np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    # # Save the numpy array as a jpg image file
    # cv2.imwrite(f"image/{index}.jpg", image_np)

批量处理代码(依次处理全部 14 个 parquet 分片,同时导出对话文本与图片)

import pandas as pd
import cv2
import numpy as np
import json
from tqdm import tqdm
import os

# Number of parquet shards; it also appears in every shard's file name.
NUM_SHARDS = 14
# Directory (relative to the working directory) that receives the images.
IMAGE_DIR = "llava_image"

# Running counter used as the image file name. It is shared by all shards so
# that names stay unique across the whole dataset.
img_name = 0
save_data_list = []

os.makedirs(IMAGE_DIR, exist_ok=True)

for index_multi_filename in range(NUM_SHARDS):
    file_path = f'data/train-{index_multi_filename:05d}-of-{NUM_SHARDS:05d}.parquet'
    print(file_path)
    try:
        pokeman = pd.read_parquet(file_path)
    except Exception as e:
        # Best effort: report an unreadable shard and carry on with the rest.
        print(f"Error reading {file_path}: {e}")
        continue

    for index, (msg, img) in tqdm(enumerate(zip(pokeman['messages'], pokeman['images'])), total=len(pokeman)):
        image_path = f"{IMAGE_DIR}/{img_name}.jpg"
        # Consume the name whether or not the image can be saved, so a failure
        # never shifts the numbering of the rows that follow.
        img_name += 1

        # Save the image first: a record is only emitted once its image file
        # really exists, so the JSON never points at a missing file.
        try:
            image_bytes = img[0]['bytes']
            image_np = cv2.imdecode(np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
            # imdecode reports undecodable data by returning None, not by raising.
            if image_np is None:
                raise ValueError("image bytes could not be decoded")
            # imwrite reports failure through its boolean return value.
            if not cv2.imwrite(image_path, image_np):
                raise OSError(f"could not write {image_path}")
        except Exception as e:
            print(f"Error processing image at index {index} from {file_path}: {e}")
            continue

        # Only the first turn of a conversation carries the "<image>"
        # placeholder; all later turns are copied unchanged.
        save_context_list = [
            {
                "content": ("<image>\n" if turn_index == 0 else "") + turn['content'],
                "role": turn['role'],
            }
            for turn_index, turn in enumerate(msg)
        ]

        save_data_list.append({
            "messages": save_context_list,
            "images": [
                image_path
            ],
        })

# ensure_ascii=False keeps non-ASCII text readable in the output file instead
# of escaping it as \uXXXX sequences.
with open("./llava_med_data.json", 'w', encoding="utf-8") as f:
    json.dump(save_data_list, f, ensure_ascii=False, indent=4)

http://www.kler.cn/a/370317.html

相关文章:

  • Linux系统之kill命令的基本使用
  • 【云岚到家】-day03-门户缓存实现实战
  • postman请求参数化
  • Windows11电脑总是一闪一闪的,黑一下亮一些怎么解决
  • Vue进阶之旅:核心技术与页面应用实战(路由进阶)
  • java基础概念59-File
  • K8s-DashBoard部署与管理
  • 【Java语言】类和对象
  • 车载导航测试:确保驾驶者的精准导航体验
  • Java项目实战II基于微信小程序的医院管理系统(开发文档+数据库+源码)
  • Laravel5 抓取第三方网站图片,存储到本地
  • Stable Diffusion视频插件Ebsynth Utility安装方法
  • npm设置镜像源
  • JavaScript 的 axios 实现文件下载功能
  • NVR批量管理软件/平台EasyNVR多个NVR同时管理支持UDP和TCP传输协议
  • 海外著名门户媒体发稿之科技时报Tech Times - 大舍传媒
  • LED显示屏模组七大参数解析
  • 【面试题系列】MySQL 中 GROUP BY 和 DISTINCT 有什么区别?
  • Git (Linux)
  • 15分钟学 Go 第 24 天:并发基础 - Channels
  • Golang | Leetcode Golang题解之第508题出现次数最多的子树元素和
  • 郑州面试得问题
  • 宇视设备视频平台EasyCVR视频融合平台果园/鱼塘/养殖场/菜园有电没网视频监控方案
  • iOS AVAudioSession 详解【音乐播放器的配置】
  • 调用detr-resnet-50进行目标检测
  • JetBrains IDE中GPU进程(JCEF)重启问题(Too many restarts of GPU-process)解决方案