当前位置：首页 > article >正文

关于一些整理图像及视频数据的代码块

article 2025/2/18 7:32:46

关于一些整理图像及视频数据的代码块，包括统计文件夹里每个视频的时长输出表格，将多个数据文件夹的图片整理成表格展示，整理大于多少张的文件夹数据，统计多个文件夹的图片数据，对不规则的图像进行padding填充灰边

1.统计文件夹里每个视频的时长输出表格

需求：统计某个文件夹里所有视频的时长，并按每 60 秒一个区间分段计数，找出视频数量最多的时长区间

#-*- coding:utf-8 -*-
import os
import datetime
import sys
import argparse
from moviepy.editor import VideoFileClip
import pandas as pd
from itertools import groupby

if __name__ == "__main__":
    # Command-line interface: the directory tree to scan and the file suffix
    # that identifies a video.
    parser = argparse.ArgumentParser(
        description='Compute Total Time of a Series of Videos')
    parser.add_argument("--path", metavar="PATH", default="/data/sku_data/algorithm_dataset/30sku_test_20210811_same_sku/",
                        help="the root path of the videos(default: %(default)s)")
    parser.add_argument("--type", metavar="TYPE", default=".mp4",
                        help="the type of the videos(default: %(default)s)")
    args = parser.parse_args()

    # Walk the tree and keep every file whose full path ends with the suffix.
    filelist = []
    for dirpath, _dirnames, filenames in os.walk(args.path):
        for name in filenames:
            fname = os.path.join(dirpath, name)
            if fname.endswith(args.type):
                filelist.append(fname)

    # Read the duration (in seconds) of every video.
    video_seconds = []
    for item in filelist:
        clip = VideoFileClip(item)
        try:
            video_seconds.append(float(clip.duration))
        finally:
            # Release the reader held by the clip; without this every
            # processed video keeps its resources until the script exits.
            clip.close()

    # Histogram with 60-second buckets. groupby only merges adjacent items,
    # so the durations are sorted first.
    for k, g in groupby(sorted(video_seconds), key=lambda x: x // 60):
        print('{}-{}: {}'.format(k*60, (k + 1) * 60, len(list(g))))

    # Optional: write a per-file table instead of the histogram. This needs
    # video_seconds to hold [os.path.basename(item), seconds] pairs.
    # column_name = ['video_name', 'seconds_count']
    # csv_name = 'video_seconds_counts.csv'
    # xml_df = pd.DataFrame(video_seconds, columns=column_name)
    # xml_df.to_csv(csv_name, index=None)

    # Optional: total running time of all videos.
    # ftime = sum(video_seconds)
    # print("%d seconds: " % ftime, str(datetime.timedelta(seconds=ftime)))

2.将多个数据文件夹的图片整理成表格展示

需求：Excel 表格中有 label 和 label_number 两列；根据 label 从底库的每个类别文件夹中抽取一张图片（代码中取文件体积最大的 .jpg），放到对应的行中展示

    import os
    import xlsxwriter
    from operator import itemgetter
    import pandas as pd

    # Excel workbook whose "Sheet1" provides the "label" and "label_number" columns.
    path1="/data2/v6-data/new_gallery_11.xlsx"
    df1=pd.read_excel(path1,sheet_name="Sheet1")
    # Lookup table: label -> label_number (for a repeated label the last row wins).
    dict1=dict(zip(df1["label"],df1["label_number"]))

    # Root directory whose entries are listed by the loop further down.
    folder = '/data2/all_padding/'

    def find_img_path(sku_folder):
        """Return the path of the largest ``.jpg`` file directly inside *sku_folder*.

        Only names ending in the lowercase suffix ``.jpg`` are considered and
        sub-directories are not searched. When several files share the largest
        size, the one listed first by ``os.listdir`` is returned.

        Args:
            sku_folder: directory to inspect.

        Returns:
            ``os.path.join(sku_folder, name)`` for the largest matching file.

        Raises:
            FileNotFoundError: if *sku_folder* contains no ``.jpg`` file.
        """
        best_name = None
        best_size = -1
        for file in os.listdir(sku_folder):
            if not file.endswith('.jpg'):
                continue
            size = os.path.getsize(os.path.join(sku_folder, file))
            # Strict comparison keeps the first file seen when sizes tie.
            if size > best_size:
                best_name, best_size = file, size
        if best_name is None:
            # Fail with a message naming the folder instead of an IndexError
            # from indexing an empty list.
            raise FileNotFoundError(f"no .jpg file found in {sku_folder}")
        return os.path.join(sku_folder, best_name)

    # Pick one representative image for every SKU directory under `folder`.
    img_path_all=[]
    for filename in os.listdir(folder):
        sku_folder=os.path.join(folder,filename)
        # facebank.pth / names.npy sit next to the SKU folders; skip them and
        # any other entry that is not a directory (os.listdir would fail on it).
        if filename in ("facebank.pth", "names.npy") or not os.path.isdir(sku_folder):
            continue
        img_path_all.append(find_img_path(sku_folder))

    # NOTE(review): XlsxWriter produces the .xlsx format although the file
    # name ends in .xls; the name is kept as-is in case something relies on it.
    wb = xlsxwriter.Workbook('/data2/all_new_bj_clear/new_skuid_pic_.xls')   # create the workbook
    pictureSheet = wb.add_worksheet("Sheet1")
    pictureSheet.set_column('C:C', 16)
    for i, img_path in enumerate(img_path_all):
        # The SKU label is the name of the directory that contains the image.
        label = os.path.basename(os.path.dirname(img_path))
        print(label)
        label_number = dict1[int(label)]
        print(label, label_number)
        # Use zero-based (row, col) addressing everywhere. The A1-style
        # strings built from a zero-based index produced "A0" for the first
        # entry, which is not a valid cell reference, and put every later
        # entry one row above the row resized by set_row(i, ...).
        pictureSheet.set_row(i, 95)
        pictureSheet.write(i, 0, label)
        pictureSheet.write(i, 1, label_number)
        # Column C holds the picture; x_scale / y_scale shrink it to half size.
        pictureSheet.insert_image(i, 2, img_path, {'x_offset': 5, 'y_offset': 1,'x_scale': 0.5, 'y_scale': 0.5})
    wb.close()

3.整理大于多少张的文件夹数据

需求：将图片数量超过指定阈值（下面代码中为 500 张，可按需修改）的子文件夹移动汇总到目标目录

    import os
    import shutil
    # Directory that is scanned, and the directory that receives the big folders.
    path = '/data2/crop/more500/'
    path2 = '/data2/clear/'

    # Collect the names of the sub-directories holding more than 500 entries.
    count = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if not os.path.isdir(candidate):
            print(f"{candidate} is not a directory")
            continue
        if len(os.listdir(candidate)) > 500:
            count.append(entry)
    print(len(count))

    # Move every selected directory from `path` to `path2`.
    for name in count:
        source_dir = os.path.join(path, name)
        target_dir = os.path.join(path2, name)
        print(name)
        print(source_dir, target_dir)
        shutil.move(source_dir, target_dir)

4.统计多个文件夹的图片数据

需求：对整理好的检索数据，统计每个子文件夹（即每个样本类别）的图片数量，并按数量从多到少导出到 Excel 表格

import os
    from openpyxl import Workbook
    # Root directory; its immediate sub-directories are the ones counted below.
    folder_path = '/data2/v4-data/baojian_all_gallery/'
    def count_images_in_folder(folder_path):
        """Count the entries directly inside *folder_path* that look like images.

        An entry counts when its extension, compared case-insensitively, is one
        of .jpg, .jpeg, .png or .gif. Sub-directories are not descended into.
        """
        image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
        return sum(
            1
            for entry in os.listdir(folder_path)
            if os.path.splitext(entry)[1].lower() in image_extensions
        )
    def get_subfolder_paths(folder_path):
        """Return the paths of the directories located directly inside *folder_path*.

        Each path is *folder_path* joined with the entry name, in the order
        produced by ``os.listdir``. Plain files are left out.
        """
        candidates = (
            os.path.join(folder_path, entry) for entry in os.listdir(folder_path)
        )
        return [candidate for candidate in candidates if os.path.isdir(candidate)]
    # Pair every sub-folder with its image count, largest count first.
    subfolder_paths = get_subfolder_paths(folder_path)
    image_counts = sorted(
        ((sub_path, count_images_in_folder(sub_path)) for sub_path in subfolder_paths),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Write a two-column sheet: a header row, then one row per sub-folder.
    wb = Workbook()
    ws = wb.active
    for column, title in enumerate(('文件夹名字', '图片数量'), start=1):
        ws.cell(row=1, column=column, value=title)
    for row, (sub_path, total) in enumerate(image_counts, start=2):
        ws.cell(row=row, column=1, value=os.path.basename(sub_path))
        ws.cell(row=row, column=2, value=total)
    wb.save('/data2/gallery.xlsx')

5.对不规则的图像进行padding填充灰边

需求：将图像等比例缩放，使长边为 224，再用灰边填充短边，统一归一化为 224×224

    import os
    import cv2 

    # Border colour, applied per channel to the image loaded by cv2.imread.
    # NOTE: despite its name this is a mid gray, not black; the name is kept
    # so existing references keep working.
    BLACK = [158,160,161]
    INPUT_DIR = "/data2/all_new_bj_clear/all/"
    OUTPUT_DIR = "/data2/all_new_bj_clear/all_padding/"
    # Side length, in pixels, of the square output images.
    TARGET_SIZE = 224

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for root, dirs, files in os.walk(INPUT_DIR):
        # Mirror the input sub-directory under OUTPUT_DIR. relpath only strips
        # the leading INPUT_DIR, whereas str.replace would rewrite every
        # occurrence of that text in the path.
        output_root = os.path.normpath(
            os.path.join(OUTPUT_DIR, os.path.relpath(root, INPUT_DIR)))
        for filename in files:
            # Best effort: a failure on one file is reported and the walk goes on.
            try:
                filename_ori = os.path.join(root, filename)
                print(filename_ori)
                image = cv2.imread(filename_ori)
                if image is None:
                    print(f"Failed to read image: {filename_ori}")
                    continue

                height, width = image.shape[:2]

                # Scale so the longer side becomes TARGET_SIZE while keeping the
                # aspect ratio. Integer arithmetic avoids float truncation
                # surprises, and max(1, ...) keeps very elongated images from
                # collapsing to a zero-pixel side, which cv2.resize rejects.
                if height > width:
                    new_width = max(1, width * TARGET_SIZE // height)
                    new_height = TARGET_SIZE
                else:
                    new_width = TARGET_SIZE
                    new_height = max(1, height * TARGET_SIZE // width)

                res = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

                # Split the missing pixels between the two sides. The second
                # side takes the remainder, so an odd difference still gives
                # exactly TARGET_SIZE (equal padding on both sides gave 223).
                pad_top = (TARGET_SIZE - new_height) // 2
                pad_bottom = TARGET_SIZE - new_height - pad_top
                pad_left = (TARGET_SIZE - new_width) // 2
                pad_right = TARGET_SIZE - new_width - pad_left

                constant = cv2.copyMakeBorder(res, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=BLACK)

                os.makedirs(output_root, exist_ok=True)
                out_filename = os.path.join(output_root, filename)
                # imwrite reports failure through its return value.
                if cv2.imwrite(out_filename, constant):
                    print(f"Saved: {out_filename}")
                else:
                    print(f"Failed to write image: {out_filename}")

            except Exception as e:
                print(f"Error processing {filename}: {e}")