frame_paths, original_frames = frame_extract(args.video, out_dir=tmp_dir.name) # extract all frames
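Here frame_extract is assumed to be the usual "decode the video and dump every frame" helper; a minimal sketch of such a helper using OpenCV (illustrative only, not the actual MMAction2 implementation):

import os
import cv2

def frame_extract_sketch(video_path, out_dir):
    """Decode a video, save each frame to out_dir, and return paths and frames."""
    cap = cv2.VideoCapture(video_path)
    frame_paths, frames = [], []
    idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        path = os.path.join(out_dir, f'img_{idx + 1:05d}.jpg')
        cv2.imwrite(path, frame)
        frame_paths.append(path)
        frames.append(frame)
        idx += 1
    cap.release()
    return frame_paths, frames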
Sample interval
window_size = clip_len * frame_interval, where clip_len is the number of frames sampled in a window and frame_interval is the sampling interval between frames within the window.
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
# Note that it's 1 based here
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                       args.predict_stepsize)
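As a concrete illustration (the numbers below are made up for this example, not taken from the demo config):

import numpy as np

clip_len, frame_interval = 8, 8
num_frame, predict_stepsize = 100, 8

window_size = clip_len * frame_interval            # 64
timestamps = np.arange(window_size // 2,
                       num_frame + 1 - window_size // 2,
                       predict_stepsize)
print(timestamps)                                  # [32 40 48 56 64]

Each timestamp is the (1-based) center frame of one window on which detection will be run.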
# Rescale the human detection boxes from the original resolution to the
# resized frames and move them to the inference device.
for i in range(len(human_detections)):
    det = human_detections[i]
    det[:, 0:4:2] *= w_ratio
    det[:, 1:4:2] *= h_ratio
    human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
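The scale factors come from resizing the original frames to the model's input resolution. A sketch of how they might be computed, assuming the short side is resized to 256 with mmcv (the exact target size depends on the config):

import mmcv
import numpy as np

# Resize frames so the short side is 256 and record the width/height ratios.
h, w, _ = original_frames[0].shape
new_w, new_h = mmcv.rescale_size((w, h), (256, np.inf))
frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
w_ratio, h_ratio = new_w / w, new_h / h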
SpatioTemporal Action Detection
Get all frames in a window according to a target frame (timestamp):
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)  # timestamps are 1-based, frame indices are 0-based
imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
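Continuing the illustrative numbers from above, the window around timestamp 32 is sampled like this:

import numpy as np

clip_len, frame_interval = 8, 8          # illustrative values only
window_size = clip_len * frame_interval  # 64
timestamp = 32                           # 1-based center frame

start_frame = timestamp - (clip_len // 2 - 1) * frame_interval   # 8
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
print(frame_inds)      # [ 8 16 24 32 40 48 56 64]  (1-based frame numbers)
print(frame_inds - 1)  # [ 7 15 23 31 39 47 55 63]  (0-based indices into frames)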
Get the result of SpatioTemporal Action Detection:
datasample = ActionDataSample()
datasample.proposals = InstanceData(bboxes=proposal)
datasample.set_metainfo(dict(img_shape=(new_h, new_w)))
with torch.no_grad():
    result = model(input_tensor, [datasample], mode='predict')
    scores = result[0].pred_instances.scores
prediction = []
# N proposals
for i in range(proposal.shape[0]):
    prediction.append([])
# Perform action score thr
for i in range(scores.shape[1]):
    if i not in label_map:
        continue
    for j in range(proposal.shape[0]):
        if scores[j, i] > args.action_score_thr:
            prediction[j].append((label_map[i], scores[j, i].item()))
predictions.append(prediction)
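The resulting structure is one entry per timestamp, containing one list of (label, score) pairs per detected person. A hypothetical way to inspect it (the loop below is illustrative, not part of the demo):

# Each element of `predictions` corresponds to one timestamp; each inner list
# corresponds to one human proposal and holds the (action_label, score) pairs
# that passed args.action_score_thr.
for ts, prediction in zip(timestamps, predictions):
    for person_id, actions in enumerate(prediction):
        labels = ', '.join(f'{name}: {score:.2f}' for name, score in actions)
        print(f'frame {ts}, person {person_id}: {labels or "no action above threshold"}')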