因为服务器的 CPU 核心比较多,所以可以用多进程并行处理任务:这里定义了 48 个进程同时运行。单个进程处理一张图片需要 3–5 分钟,比较耗时。主要任务是从 Open Images 数据集中提取出自己需要的分割数据集。
code:
import os
import cv2
import csv
import numpy as np
from multiprocessing import Pool
import time
# Sub-directory names that check_path() probes, in this order, when looking
# for an original photo.
train_list = ['train_00', 'train_01', 'train_02', 'train_03', 'train_04', 'train_05', 'train_06', 'train_07', 'train_08']
# Text file with one mask-image path per line.
txt_path = '/home/test_list.txt'
# Bounding-box annotation CSV; cpr_box() reads columns 0, 2 and 4-7 of each row.
csv_path = '/home/open-image/v5/test-annotations-bbox.csv'
txt_list = []
csv_list = []
img_count = 0
# Read the path list inside a context manager so the handle is closed, and
# strip only the newline: slicing with [:-1] would eat a real character from
# a final line that has no trailing newline.  Blank lines are kept so that
# list indices (used for output file names) stay aligned with the file.
with open(txt_path) as list_file:
    for i in list_file:
        txt_list.append(i.rstrip('\n'))
# Keep the raw lines (header included, newlines intact); rows are parsed
# lazily by cpr_box().
with open(csv_path,'r') as fp:
    csv_list = fp.readlines()
def compute_iou(rect1, rect2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Each box is indexable as ``(x_min, y_min, x_max, y_max)``.

    Returns:
        ``0`` when the boxes do not overlap (boxes that only share an edge
        count as non-overlapping), otherwise
        ``intersection / (area1 + area2 - intersection)``.
    """
    # Edges of the candidate overlap rectangle.
    overlap_left = max(rect1[0], rect2[0])
    overlap_right = min(rect1[2], rect2[2])
    overlap_top = max(rect1[1], rect2[1])
    overlap_bottom = min(rect1[3], rect2[3])

    # Guard clause: an empty or inverted overlap means no intersection.
    if overlap_right <= overlap_left or overlap_bottom <= overlap_top:
        return 0

    overlap_area = (overlap_right - overlap_left) * (overlap_bottom - overlap_top)
    area_first = (rect1[3] - rect1[1]) * (rect1[2] - rect1[0])
    area_second = (rect2[3] - rect2[1]) * (rect2[2] - rect2[0])
    return overlap_area / (area_first + area_second - overlap_area)
def get_index(im_array):
    """Return the tight bounding box of the non-zero entries of a 2-D array.

    The search is done with ``np.nonzero`` instead of a Python loop over
    every pixel, which is what made this step slow on full-size images.

    Args:
        im_array: 2-D array indexed as ``[row][column]`` (for example a
            single-channel mask).

    Returns:
        ``(x_min, y_min, x_max, y_max)`` where x values are column indices
        and y values are row indices of non-zero entries, bounds inclusive.

    Raises:
        ValueError: If the array contains no non-zero entry, so there is
            nothing to take a minimum/maximum of.
    """
    row_idx, col_idx = np.nonzero(np.asarray(im_array))
    if row_idx.size == 0:
        raise ValueError('get_index: array contains no non-zero pixels')
    return np.min(col_idx), np.min(row_idx), np.max(col_idx), np.max(row_idx)
def cpr_box(str_0, img_box, im_height, im_width):
    """Find the first annotation row matching a key that overlaps ``img_box``.

    Scans the module-level ``csv_list`` (raw CSV lines; the first line is
    skipped as a header).  For each row, column 0 and column 2 are
    concatenated and compared with ``str_0``; only for matching rows are
    columns 4-7 parsed as x-min, x-max, y-min, y-max, scaled by
    ``im_width`` / ``im_height`` and compared against ``img_box`` by IoU.

    Args:
        str_0: Lookup key, i.e. column 0 + column 2 of the wanted row with
            no separator in between.
        img_box: Reference box ``(x_min, y_min, x_max, y_max)``.
        im_height: Factor applied to the row's y values.
        im_width: Factor applied to the row's x values.

    Returns:
        ``(xmin, ymin, xmax, ymax)`` as scaled floats for the first matching
        row whose IoU with ``img_box`` is greater than 0.35, or ``None`` when
        no such row exists.
    """
    for ii in range(1, len(csv_list)):
        # Split once per row; strip only line terminators so a final line
        # without a newline does not lose its last character.
        fields = csv_list[ii].rstrip('\r\n').split(',')
        if len(fields) < 8:
            # Blank or truncated row: nothing to compare.
            continue
        # Compare the key before doing any float parsing or IoU work; the
        # vast majority of rows belong to other images/labels.
        if fields[0] + fields[2] != str_0:
            continue
        xmin = float(fields[4]) * im_width
        xmax = float(fields[5]) * im_width
        ymin = float(fields[6]) * im_height
        ymax = float(fields[7]) * im_height
        if compute_iou((xmin, ymin, xmax, ymax), img_box) > 0.35:
            return xmin, ymin, xmax, ymax
    return None
def check_path(image_id):
    """Locate the JPEG for ``image_id`` under the known sub-directories.

    Builds ``/home/public/openimage/<dir>/<image_id>.jpg`` for every entry
    of the module-level ``train_list``, in order.

    Returns:
        The first candidate path that exists on disk, or ``None`` if none do.
    """
    candidates = (
        '/home/public/openimage/' + folder + '/' + image_id + '.jpg'
        for folder in train_list
    )
    # Lazily probe each candidate and stop at the first hit.
    return next((path for path in candidates if os.path.exists(path)), None)
def _process_image(pro_num):
    """Crop one mask image and its source photo to the matched box and save both.

    Args:
        pro_num: Index into the module-level ``txt_list``; also used as the
            zero-padded output file name.

    Returns:
        True if both crops were written, False if the image was skipped.
    """
    pro_im_path = txt_list[pro_num]
    img = cv2.imread(pro_im_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        print('Skip %s: cannot read mask image' % pro_im_path)
        return False
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    try:
        img_box = tuple(get_index(img_gray))
    except ValueError:
        # get_index has nothing to take min/max of when the mask is all zero.
        print('Skip %s: mask has no non-zero pixels' % pro_im_path)
        return False
    im_height = img.shape[0]
    im_width = img.shape[1]

    # The image id is the part of the file name before the first '_'.
    im_id = pro_im_path.split('/')[-1].split('_')[0]
    cpr_set = cpr_box(im_id + '/m/04_sv', img_box, im_height, im_width)
    if cpr_set is None:
        return False
    ori_img_path = check_path(im_id)
    if ori_img_path is None:
        return False
    ori_im = cv2.imread(ori_img_path)
    if ori_im is None:
        print('Skip %s: cannot read original image' % ori_img_path)
        return False

    x_min, y_min, x_max, y_max = cpr_set[0], cpr_set[1], cpr_set[2], cpr_set[3]
    im_ori_height = ori_im.shape[0]
    im_ori_width = ori_im.shape[1]
    # Rescale the box from mask resolution to the original photo's resolution.
    x_ori_min = im_ori_width * x_min / im_width
    y_ori_min = im_ori_height * y_min / im_height
    x_ori_max = im_ori_width * x_max / im_width
    y_ori_max = im_ori_height * y_max / im_height

    msk_size = (int(x_max - x_min), int(y_max - y_min))
    ori_size = (int(x_ori_max - x_ori_min), int(y_ori_max - y_ori_min))
    if min(msk_size) < 1 or min(ori_size) < 1:
        # A box thinner than one pixel cannot be cropped.
        return False

    tmp_im = cv2.getRectSubPix(
        img, msk_size,
        (int((x_min + x_max) / 2), int((y_min + y_max) / 2)))
    # The photo crop must come from the original photo, not from the mask.
    tmp_im_ori = cv2.getRectSubPix(
        ori_im, ori_size,
        (int((x_ori_min + x_ori_max) / 2), int((y_ori_min + y_ori_max) / 2)))

    # Bring both crops to one common size so mask and photo stay aligned.
    width_min = int(min(tmp_im.shape[1], tmp_im_ori.shape[1]))
    out_size = (width_min, int(width_min * im_height / im_width))
    if out_size[1] < 1:
        return False
    im_resize = cv2.resize(tmp_im, out_size)
    im_ori_resize = cv2.resize(tmp_im_ori, out_size)
    cv2.imwrite('/home/img/{:09d}.jpg'.format(pro_num), im_ori_resize)
    cv2.imwrite('/home/msk/{:09d}.jpg'.format(pro_num), im_resize)
    return True


def Processing_Task(TaskID):
    """Process the slice of ``txt_list`` that belongs to one task.

    Task ``TaskID`` handles the 48 consecutive entries starting at index
    ``TaskID * 48``; the slice is clipped to the end of the list so a short
    final slice does not raise ``IndexError``.  The elapsed time of every
    image is printed.

    Args:
        TaskID: Zero-based task number.
    """
    first = TaskID * 48
    # Upper bound is exclusive, so +48 (not +47) covers the whole slice.
    last = min(first + 48, len(txt_list))
    for pro_num in range(first, last):
        time_start = time.time()
        _process_image(pro_num)
        time_end = time.time()
        print('TaskID %d costs %0.2f'%(TaskID,time_end - time_start))
if __name__=='__main__':
    print('Start....')
    # Each task covers a slice of 48 list entries starting at TaskID * 48,
    # so submit enough tasks to reach the end of txt_list instead of a fixed 48.
    images_per_task = 48
    task_count = (len(txt_list) + images_per_task - 1) // images_per_task
    p = Pool(48)
    # Keep the AsyncResult handles: apply_async stores a worker's exception
    # in its result and it is lost unless get() is called.
    pending = [p.apply_async(Processing_Task, args = (po,)) for po in range(task_count)]
    p.close()
    p.join()
    for po, result in enumerate(pending):
        try:
            result.get()
        except Exception as exc:
            # Top-level boundary: report the failed task and keep going.
            print('TaskID %d failed: %r' % (po, exc))
这样就很节省时间。大概有 12000 张图片,如果用单进程去做,按每张 5 分钟计算最多需要约 1000 个小时;改用多进程以后,实际测得平均每张图片约 0.2 分钟,总共约需 40 个小时,效率提升很可观。