# object_detection_yolo.py
  1. # This code is written at BigVision LLC. It is based on the OpenCV project. It is subject to the license terms in the LICENSE file found in this distribution and at http://opencv.org/license.html
  2. # Usage example: python3 object_detection_yolo.py --video=run.mp4 --device 'cpu'
  3. # python3 object_detection_yolo.py --video=run.mp4 --device 'gpu'
  4. # python3 object_detection_yolo.py --image=bird.jpg --device 'cpu'
  5. # python3 object_detection_yolo.py --image=bird.jpg --device 'gpu'
  6. import cv2 as cv
  7. import argparse
  8. import sys
  9. import numpy as np
  10. import os.path
  11. # Initialize the parameters
  12. confThreshold = 0.5 #Confidence threshold
  13. nmsThreshold = 0.4 #Non-maximum suppression threshold
  14. inpWidth = 416 #Width of network's input image
  15. inpHeight = 416 #Height of network's input image
  16. parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV')
  17. parser.add_argument('--device', default='cpu', help="Device to perform inference on 'cpu' or 'gpu'.")
  18. parser.add_argument('--image', help='Path to image file.')
  19. parser.add_argument('--video', help='Path to video file.')
  20. args = parser.parse_args()
  21. # Load names of classes
  22. classesFile = "coco.names"
  23. classes = None
  24. with open(classesFile, 'rt') as f:
  25. classes = f.read().rstrip('\n').split('\n')
  26. # Give the configuration and weight files for the model and load the network using them.
  27. modelConfiguration = "yolov3.cfg"
  28. modelWeights = "yolov3.weights"
  29. net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
  30. if(args.device == 'cpu'):
  31. net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
  32. net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
  33. print('Using CPU device.')
  34. elif(args.device == 'gpu'):
  35. net.setPreferableBackend(cv.dnn.DNN_BACKEND_CUDA)
  36. net.setPreferableTarget(cv.dnn.DNN_TARGET_CUDA)
  37. print('Using GPU device.')
  38. # Get the names of the output layers
  39. def getOutputsNames(net):
  40. # Get the names of all the layers in the network
  41. layersNames = net.getLayerNames()
  42. # Get the names of the output layers, i.e. the layers with unconnected outputs
  43. return [layersNames[i[0] - 1] for i in net.getUnconnectedOutLayers()]
  44. # Draw the predicted bounding box
  45. def drawPred(classId, conf, left, top, right, bottom):
  46. # Draw a bounding box.
  47. cv.rectangle(frame, (left, top), (right, bottom), (255, 178, 50), 3)
  48. label = '%.2f' % conf
  49. # Get the label for the class name and its confidence
  50. if classes:
  51. assert(classId < len(classes))
  52. label = '%s:%s' % (classes[classId], label)
  53. #Display the label at the top of the bounding box
  54. labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
  55. top = max(top, labelSize[1])
  56. cv.rectangle(frame, (left, top - round(1.5*labelSize[1])), (left + round(1.5*labelSize[0]), top + baseLine), (255, 255, 255), cv.FILLED)
  57. cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.75, (0,0,0), 1)
  58. # Remove the bounding boxes with low confidence using non-maxima suppression
  59. def postprocess(frame, outs):
  60. frameHeight = frame.shape[0]
  61. frameWidth = frame.shape[1]
  62. # Scan through all the bounding boxes output from the network and keep only the
  63. # ones with high confidence scores. Assign the box's class label as the class with the highest score.
  64. classIds = []
  65. confidences = []
  66. boxes = []
  67. for out in outs:
  68. for detection in out:
  69. scores = detection[5:]
  70. classId = np.argmax(scores)
  71. confidence = scores[classId]
  72. if confidence > confThreshold:
  73. center_x = int(detection[0] * frameWidth)
  74. center_y = int(detection[1] * frameHeight)
  75. width = int(detection[2] * frameWidth)
  76. height = int(detection[3] * frameHeight)
  77. left = int(center_x - width / 2)
  78. top = int(center_y - height / 2)
  79. classIds.append(classId)
  80. confidences.append(float(confidence))
  81. boxes.append([left, top, width, height])
  82. # Perform non maximum suppression to eliminate redundant overlapping boxes with
  83. # lower confidences.
  84. indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
  85. for i in indices:
  86. i = i[0]
  87. box = boxes[i]
  88. left = box[0]
  89. top = box[1]
  90. width = box[2]
  91. height = box[3]
  92. drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
  93. # Process inputs
  94. winName = 'Deep learning object detection in OpenCV'
  95. cv.namedWindow(winName, cv.WINDOW_NORMAL)
  96. outputFile = "yolo_out_py.avi"
  97. if (args.image):
  98. # Open the image file
  99. if not os.path.isfile(args.image):
  100. print("Input image file ", args.image, " doesn't exist")
  101. sys.exit(1)
  102. cap = cv.VideoCapture(args.image)
  103. outputFile = args.image[:-4]+'_yolo_out_py.jpg'
  104. elif (args.video):
  105. # Open the video file
  106. if not os.path.isfile(args.video):
  107. print("Input video file ", args.video, " doesn't exist")
  108. sys.exit(1)
  109. cap = cv.VideoCapture(args.video)
  110. outputFile = args.video[:-4]+'_yolo_out_py.avi'
  111. else:
  112. # Webcam input
  113. cap = cv.VideoCapture(0)
  114. # Get the video writer initialized to save the output video
  115. if (not args.image):
  116. vid_writer = cv.VideoWriter(outputFile, cv.VideoWriter_fourcc('M','J','P','G'), 30, (round(cap.get(cv.CAP_PROP_FRAME_WIDTH)),round(cap.get(cv.CAP_PROP_FRAME_HEIGHT))))
  117. while cv.waitKey(1) < 0:
  118. # get frame from the video
  119. hasFrame, frame = cap.read()
  120. # Stop the program if reached end of video
  121. if not hasFrame:
  122. print("Done processing !!!")
  123. print("Output file is stored as ", outputFile)
  124. cv.waitKey(3000)
  125. # Release device
  126. cap.release()
  127. break
  128. # Create a 4D blob from a frame.
  129. blob = cv.dnn.blobFromImage(frame, 1/255, (inpWidth, inpHeight), [0,0,0], 1, crop=False)
  130. # Sets the input to the network
  131. net.setInput(blob)
  132. # Runs the forward pass to get output of the output layers
  133. outs = net.forward(getOutputsNames(net))
  134. # Remove the bounding boxes with low confidence
  135. postprocess(frame, outs)
  136. # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
  137. t, _ = net.getPerfProfile()
  138. label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
  139. cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
  140. # Write the frame with the detection boxes
  141. if (args.image):
  142. cv.imwrite(outputFile, frame.astype(np.uint8))
  143. else:
  144. vid_writer.write(frame.astype(np.uint8))
  145. cv.imshow(winName, frame)