1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
| from torchvision.models.detection import FasterRCNN_MobileNet_V3_Large_FPN_Weights, fasterrcnn_mobilenet_v3_large_fpn import torch import torchvision.transforms as T import cv2 import datetime
def main(input_path="input.mp4", output_path="output.mp4"):
    """Run Faster R-CNN object detection over a video and save the result.

    Every frame of ``input_path`` is passed through a pretrained
    Faster R-CNN (MobileNetV3-Large FPN) detector configured with
    ``box_score_thresh=0.9``. Each returned detection is drawn on the frame
    as a green rectangle with its category name. Annotated frames are
    written to ``output_path`` using the ``mp4v`` codec and shown in a
    preview window; pressing ``q`` in that window stops processing early.
    The elapsed processing time is printed once the frame loop ends.

    Args:
        input_path: Path of the video file to read.
        output_path: Path of the annotated video file to write.

    Raises:
        RuntimeError: If ``input_path`` cannot be opened by OpenCV.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError(f"Could not open input video: {input_path}")

    out = None
    try:
        # Use the GPU when one is present, otherwise fall back to the CPU
        # instead of crashing on the unconditional .cuda() call.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        weights = FasterRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT
        model = fasterrcnn_mobilenet_v3_large_fpn(
            weights=weights, box_score_thresh=0.9
        ).to(device)
        model.eval()
        categories = weights.meta["categories"]

        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # Loop-invariant: build the frame-to-tensor transform only once.
        to_tensor = T.ToTensor()

        start = datetime.datetime.now()
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes frames as BGR, while the torchvision detector
            # expects RGB input, so convert before inference. Drawing and
            # writing below still use the original BGR frame.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_tensor = to_tensor(rgb_frame).unsqueeze(0).to(device)

            with torch.no_grad():
                detections = model(img_tensor)[0]

            # Move the results to plain Python ints in one transfer each,
            # rather than indexing with individual device tensors.
            boxes = detections["boxes"].int().tolist()
            labels = [categories[i] for i in detections["labels"].tolist()]

            for (x1, y1, x2, y2), label in zip(boxes, labels):
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                # Keep the caption inside the image for boxes at the top edge.
                text_y = max(y1 - 10, 15)
                cv2.putText(
                    frame,
                    f'Label: {label}',
                    (x1, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (0, 255, 0),
                    2,
                )

            out.write(frame)
            cv2.imshow('Object Detection', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        end = datetime.datetime.now()
        print(end - start)
    finally:
        # Always release the capture and the writer: an unreleased writer
        # can leave the output file unfinalized.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()
# Run the detection pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|