- Dataset: https://huggingface.co/datasets/BUAADreamer/llava-med-zh-instruct-60k

The script below reads a single parquet shard of the dataset and converts its conversations into a JSON file: each entry pairs a message list with the path of its corresponding image, and the first turn is prefixed with an `<image>` placeholder.
import json

import pandas as pd

# Read the first parquet shard of the dataset
pokeman = pd.read_parquet('data/train-00000-of-00014.parquet')
print(pokeman)

img_name = 0
save_data_list = []
for index, msg in enumerate(pokeman['messages']):
    save_context_list = []
    for index_text in range(len(msg)):
        # Only the first turn carries the <image> placeholder
        if index_text == 0:
            save_context = {
                "content": "<image>\n" + msg[index_text]['content'],
                "role": msg[index_text]['role']
            }
        else:
            save_context = {
                "content": msg[index_text]['content'],
                "role": msg[index_text]['role']
            }
        save_context_list.append(save_context)
    save_data = {
        "messages": save_context_list,
        "images": [
            f"llava_image/{img_name}.jpg"
        ]
    }
    save_data_list.append(save_data)
    img_name = img_name + 1

with open("./llava_med_data.json", 'w', encoding="utf-8") as f:
    json.dump(save_data_list, f, ensure_ascii=False, indent=4)
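For reference, each entry of the resulting llava_med_data.json has the shape sketched below. The role values are copied straight from the dataset's messages field; the conversation text here is only a placeholder, not actual dataset content:

[
    {
        "messages": [
            {
                "content": "<image>\n(question about the image)",
                "role": "user"
            },
            {
                "content": "(answer text)",
                "role": "assistant"
            }
        ],
        "images": [
            "llava_image/0.jpg"
        ]
    },
    ...
]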
Batch processing code
import json
import os

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

img_name = 0
save_data_list = []
os.makedirs("./llava_image", exist_ok=True)

# Iterate over all 14 shards: train-00000-of-00014.parquet ... train-00013-of-00014.parquet
for index_multi_filename in range(14):
    file_path = f'data/train-{index_multi_filename:05d}-of-00014.parquet'
    print(file_path)
    try:
        pokeman = pd.read_parquet(file_path)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        continue

    for index, (msg, img) in tqdm(enumerate(zip(pokeman['messages'], pokeman['images'])), total=len(pokeman)):
        # Build the message list; only the first turn carries the <image> placeholder
        save_context_list = []
        for index_text in range(len(msg)):
            content_prefix = "<image>\n" if index_text == 0 else ""
            save_context = {
                "content": content_prefix + msg[index_text]['content'],
                "role": msg[index_text]['role']
            }
            save_context_list.append(save_context)

        save_data = {
            "messages": save_context_list,
            "images": [
                f"llava_image/{img_name}.jpg"
            ]
        }
        save_data_list.append(save_data)

        # Decode the embedded image bytes and write them out as a JPEG
        try:
            image_bytes = img[0]['bytes']
            image_np = cv2.imdecode(np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
            if image_np is None:
                raise ValueError("cv2.imdecode returned None")
            cv2.imwrite(f"llava_image/{img_name}.jpg", image_np)
        except Exception as e:
            print(f"Error processing image at index {index} from {file_path}: {e}")

        # Increment even if the image failed, so file names stay aligned with the JSON entries
        img_name += 1

with open("./llava_med_data.json", 'w', encoding="utf-8") as f:
    json.dump(save_data_list, f, ensure_ascii=False, indent=4)
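As an optional sanity check (not part of the original conversion), the sketch below reloads llava_med_data.json and counts how many referenced images are actually missing on disk; it assumes it is run from the same working directory as the conversion script:

import json
import os

with open("./llava_med_data.json", encoding="utf-8") as f:
    data = json.load(f)

# Collect every referenced image path that does not exist on disk
missing = [p for entry in data for p in entry["images"] if not os.path.exists(p)]
print(f"{len(data)} samples, {len(missing)} missing images")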