How to run object detection with OpenCV DNN

Object detection turns an image into class labels, confidence scores, and boxes that locate each detected object. OpenCV DNN is useful when a trained detector already exists and the application only needs local CPU inference without a full training framework.

The OpenCV DNN module runs inference rather than training. A Python script can load a YOLOv8 ONNX model with readNetFromONNX, prepare an input blob with blobFromImage, run forward, apply non-maximum suppression, and draw the remaining detections back onto the source image.

Keep the model, label file, confidence threshold, and test image together so each result can be traced back to the detector that produced it. A successful smoke test reports expected object classes and writes an annotated image that another OpenCV step can open.

Steps to run object detection with OpenCV DNN:

  1. Open a Python environment with OpenCV and NumPy available.

    Run every later step from the same working directory and the same Python environment, so the relative paths used by the download commands, the script, and the inference run all resolve to the same files.

  2. Create working directories for the test image, model, labels, and annotated output.
    $ mkdir -p input models output
  3. Download the YOLOv8 ONNX model referenced by the OpenCV DNN sample configuration.
    $ curl --fail --location --output models/yolov8n.onnx \
      https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8n.onnx

    The sample model uses 640×640 input, RGB channel order, and a 1/255 scale factor.

  4. Download the COCO class labels used by the YOLO sample.
    $ curl --fail --location --output models/coco.names \
      https://raw.githubusercontent.com/opencv/opencv/4.x/samples/data/dnn/object_detection_classes_yolo.txt
  5. Download the OpenCV test image.
    $ curl --fail --location --output input/dog.png \
      https://raw.githubusercontent.com/opencv/opencv_extra/4.x/testdata/dnn/dog416.png

    Replace input/dog.png with any readable image when validating a different detector or scene.

  6. Save the detection script as detect_objects_dnn.py.
    detect_objects_dnn.py
    from argparse import ArgumentParser
    from pathlib import Path
     
    import cv2 as cv
    import numpy as np
     
     
    def load_classes(path):
        """Read class labels from a text file, one label per line.

        Surrounding whitespace is stripped from every line and lines that are
        empty after stripping are skipped, so the returned list index of a
        label is its position among the non-blank lines.

        Args:
            path: Path of the label file to read.

        Returns:
            A list of label strings in file order.
        """
        # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading
        # byte-order mark, which str.strip() would otherwise leave glued to
        # the first label.
        with open(path, "r", encoding="utf-8-sig") as handle:
            stripped = (line.strip() for line in handle)
            return [label for label in stripped if label]
     
     
    def letterbox(image, size):
        """Fit ``image`` into a ``size`` x ``size`` canvas without distorting it.

        The image is resized by a single factor so that it fits inside the
        square, then pasted in the centre of a canvas filled with the value
        114. The canvas has three channels, so ``image`` is expected to be a
        three-channel array.

        Args:
            image: Source image array; its first two dimensions are read as
                height and width.
            size: Side length of the square canvas in pixels.

        Returns:
            A tuple ``(canvas, scale, pad_x, pad_y)``: the padded image, the
            resize factor that was applied, and the left/top offsets of the
            resized image inside the canvas.
        """
        height, width = image.shape[:2]
        scale = min(size / width, size / height)
        # Clamp to one pixel: with an extreme aspect ratio the shorter side can
        # round down to zero, and cv.resize rejects a zero-sized target.
        resized_width = max(1, int(round(width * scale)))
        resized_height = max(1, int(round(height * scale)))
        resized = cv.resize(image, (resized_width, resized_height), interpolation=cv.INTER_LINEAR)

        canvas = np.full((size, size, 3), 114, dtype=np.uint8)
        # Integer division keeps the paste position on whole pixels; any odd
        # leftover pixel ends up on the right/bottom side.
        pad_x = (size - resized_width) // 2
        pad_y = (size - resized_height) // 2
        canvas[pad_y:pad_y + resized_height, pad_x:pad_x + resized_width] = resized
        return canvas, scale, pad_x, pad_y
     
     
    def yolo_rows(output):
        """Return the network output as a 2-D array with one candidate per row.

        The first element of ``output`` is taken; if that element is
        three-dimensional its first entry is taken as well. The resulting
        matrix is transposed when it has fewer rows than columns, so the
        longer axis always ends up as the row axis.
        """
        first = output[0]
        matrix = first[0] if first.ndim == 3 else first
        needs_transpose = matrix.shape[0] < matrix.shape[1]
        return matrix.T if needs_transpose else matrix
     
     
    def clip_box(x, y, width, height, image_width, image_height):
        """Clip a box to the image and return it as integer ``[x, y, width, height]``.

        The origin and size are rounded to integers first. Both the near edge
        (left/top) and the far edge (right/bottom) are then clamped to the
        image, and the size is derived from the clamped edges. A box that
        starts outside the image on the left or top therefore shrinks by the
        part that was cut off instead of keeping its full size from the new
        origin.

        Args:
            x: Left edge of the box in image pixels.
            y: Top edge of the box in image pixels.
            width: Box width in pixels.
            height: Box height in pixels.
            image_width: Width of the image in pixels.
            image_height: Height of the image in pixels.

        Returns:
            ``[x, y, width, height]`` as integers, with the origin inside the
            image and a size of at least one pixel in each direction.
        """
        left = int(round(x))
        top = int(round(y))
        # Far edges are computed from the unclamped origin so that clamping
        # the origin does not move them.
        right = left + int(round(width))
        bottom = top + int(round(height))

        left = max(0, min(left, image_width - 1))
        top = max(0, min(top, image_height - 1))
        right = min(right, image_width)
        bottom = min(bottom, image_height)

        # Keep at least one pixel so degenerate boxes stay drawable.
        return [left, top, max(1, right - left), max(1, bottom - top)]
     
     
    # Command line: four required paths followed by the tunable thresholds.
    parser = ArgumentParser(description="Run YOLOv8 object detection with OpenCV DNN.")
    for positional_name, positional_help in (
        ("image", "Input image path."),
        ("model", "YOLOv8 ONNX model path."),
        ("classes", "Class names file, one label per line."),
        ("output", "Annotated output image path."),
    ):
        parser.add_argument(positional_name, help=positional_help)
    for option_name, option_type, option_default, option_help in (
        ("--confidence", float, 0.35, "Confidence threshold."),
        ("--nms", float, 0.45, "Non-maximum suppression threshold."),
        ("--input-size", int, 640, "Square DNN input size."),
        ("--max-print", int, 5, "Maximum kept detections to print."),
    ):
        parser.add_argument(option_name, type=option_type, default=option_default, help=option_help)
    args = parser.parse_args()

    # cv.imread signals failure by returning None, so check before going on.
    image = cv.imread(args.image, cv.IMREAD_COLOR)
    if image is None:
        raise SystemExit(f"Could not read image: {args.image}")

    classes = load_classes(args.classes)


    def label_for(class_id):
        """Return the label for ``class_id``, or ``class_<id>`` if the label list is too short."""
        if class_id < len(classes):
            return classes[class_id]
        return f"class_{class_id}"


    # Load the model and pin inference to OpenCV's own backend on the CPU.
    net = cv.dnn.readNetFromONNX(args.model)
    net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
    net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

    # Letterbox to the square network input, scale pixels by 1/255, swap B and R.
    side = args.input_size
    input_image, scale, pad_x, pad_y = letterbox(image, side)
    blob = cv.dnn.blobFromImage(input_image, 1 / 255.0, (side, side), swapRB=True, crop=False)
    net.setInput(blob)
    rows = yolo_rows(net.forward())

    image_height, image_width = image.shape[:2]
    boxes, scores, class_ids = [], [], []

    # Each row is read as four box values followed by one score per class.
    for candidate in rows:
        per_class = candidate[4:]
        best_class = int(np.argmax(per_class))
        best_score = float(per_class[best_class])
        if best_score < args.confidence:
            continue

        center_x, center_y, box_width, box_height = candidate[:4]
        # Map from letterboxed coordinates back to the source image: move from
        # centre to top-left, remove the padding, then undo the resize factor.
        left = (center_x - box_width / 2 - pad_x) / scale
        top = (center_y - box_height / 2 - pad_y) / scale
        boxes.append(
            clip_box(left, top, box_width / scale, box_height / scale, image_width, image_height)
        )
        scores.append(best_score)
        class_ids.append(best_class)

    # Flatten whatever index container NMSBoxes returns into a plain list;
    # an empty result becomes an empty list.
    indices = cv.dnn.NMSBoxes(boxes, scores, args.confidence, args.nms)
    if len(indices):
        indices = np.array(indices).reshape(-1).tolist()
    else:
        indices = []
    # Highest-scoring detections first, for both drawing and printing.
    indices.sort(key=lambda item: scores[item], reverse=True)

    box_color = (0, 180, 0)
    for index in indices:
        x, y, width, height = boxes[index]
        label = label_for(class_ids[index])
        cv.rectangle(image, (x, y), (x + width, y + height), box_color, 2)
        # Keep the caption baseline at least 20 px from the top edge.
        cv.putText(image, f"{label} {scores[index]:.2f}", (x, max(20, y - 8)), cv.FONT_HERSHEY_SIMPLEX, 0.55, box_color, 2)

    # Create the output directory on demand, then fail loudly if the write fails.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    written = cv.imwrite(str(output_path), image)
    if not written:
        raise SystemExit(f"Could not write image: {output_path}")

    print(f"model={args.model} backend=opencv target=cpu")
    print(f"image={args.image} shape={image_width}x{image_height}")
    print(f"raw_candidates={len(boxes)} kept_detections={len(indices)}")
    for index in indices[:args.max_print]:
        x, y, width, height = boxes[index]
        label = label_for(class_ids[index])
        print(f"{label} confidence={scores[index]:.2f} box={x},{y},{width},{height}")
    print(f"wrote={output_path}")
  7. Run the detector against the sample image.
    $ python3 detect_objects_dnn.py input/dog.png models/yolov8n.onnx models/coco.names output/dog-detections.jpg --confidence 0.35
    model=models/yolov8n.onnx backend=opencv target=cpu
    image=input/dog.png shape=416x416
    raw_candidates=30 kept_detections=3
    dog confidence=0.83 box=71,161,96,230
    bicycle confidence=0.81 box=67,98,240,206
    truck confidence=0.58 box=253,54,122,70
    wrote=output/dog-detections.jpg

    Use class labels that match the model. A mismatched label file can make correct boxes appear with wrong object names.

  8. Check that OpenCV can open the annotated output image.
    $ python3 - <<'PY'
    import cv2 as cv

    # cv.imread returns None rather than raising when the file cannot be read,
    # so report that case explicitly instead of failing on ".shape".
    image = cv.imread("output/dog-detections.jpg")
    if image is None:
        raise SystemExit("Could not read image: output/dog-detections.jpg")
    print(f"annotated_image_shape={image.shape[1]}x{image.shape[0]} channels={image.shape[2]}")
    PY
    annotated_image_shape=416x416 channels=3