How to run pose estimation/AlphaPose on a video instead of a webcam?

I don’t have access to a webcam, so is it possible to pass a video through pose estimation instead? Are there any examples lying around?

Please try this code.

import time

import cv2
import mxnet as mx
import gluoncv
from gluoncv.model_zoo import get_model
from gluoncv.data.transforms.pose import detector_to_alpha_pose, heatmap_to_coord
from gluoncv.utils.viz import cv_plot_image, cv_plot_keypoints
from gluoncv import utils

url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/v_Basketball_g01_c01.avi'
video_fname = utils.download(url)

ctx = mx.cpu()

# Person detector; restrict its output classes to 'person' only
detector = get_model('ssd_512_mobilenet1.0_coco', pretrained=True, ctx=ctx)
detector.reset_class(classes=['person'], reuse_weights={'person': 'person'})
detector.hybridize()

# AlphaPose keypoint estimator
estimator = get_model('alpha_pose_resnet101_v1b_coco', pretrained=True, ctx=ctx)
estimator.hybridize()

cap = cv2.VideoCapture(video_fname)
fps = cap.get(cv2.CAP_PROP_FPS)

start = time.time()
while True:
    # Seek to the frame matching elapsed wall-clock time, so playback stays
    # real-time and slow inference simply skips frames, like a webcam would
    sec = time.time() - start
    cap.set(cv2.CAP_PROP_POS_FRAMES, round(fps * sec))
    ret, frame = cap.read()

    if ret:
        # OpenCV delivers BGR; the GluonCV transforms expect RGB
        frame = mx.nd.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).astype('uint8')
        x, frame = gluoncv.data.transforms.presets.ssd.transform_test(frame, short=240)
        x = x.as_in_context(ctx)
        class_IDs, scores, bounding_boxs = detector(x)
        pose_input, upscale_bbox = detector_to_alpha_pose(frame, class_IDs, scores, bounding_boxs)

        if upscale_bbox is not None:
            predicted_heatmap = estimator(pose_input.as_in_context(ctx))
            pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
            img = cv_plot_keypoints(frame, pred_coords, confidence, class_IDs, bounding_boxs, scores,
                                    box_thresh=0.5, keypoint_thresh=0.2)
            cv_plot_image(img)
        else:
            # No person detected: show the unannotated frame
            cv_plot_image(frame)
    else:
        break

    if cv2.waitKey(1) == 27:  # ESC to stop
        break

cap.release()
cv2.destroyAllWindows()
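A side note on speed: everything above runs on the CPU (ctx = mx.cpu()). If you have a CUDA-enabled build of MXNet (1.3 or newer), the same pipeline runs on the GPU by changing only the context; a minimal sketch:

ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()  # use the first GPU if one is available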

That worked beautifully. Another question: if I wanted to save the output video, how would I do that?

If you want to save the result from a video file (not a webcam) to a new file, you can process every frame of the video instead of seeking by wall-clock time.
Here is a simple example.

import cv2
import mxnet as mx
import gluoncv
from gluoncv.model_zoo import get_model
from gluoncv.data.transforms.pose import detector_to_alpha_pose, heatmap_to_coord
from gluoncv.utils.viz import cv_plot_keypoints
from gluoncv import utils

url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/v_Basketball_g01_c01.avi'
video_fname = utils.download(url)

cap = cv2.VideoCapture(video_fname)
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the codec and create a VideoWriter with the source frame rate and size
fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
out = cv2.VideoWriter('output.avi', fourcc, fps, (width, height))

ctx = mx.cpu()
detector = get_model('ssd_512_mobilenet1.0_coco', pretrained=True, ctx=ctx)
detector.reset_class(classes=['person'], reuse_weights={'person': 'person'})
detector.hybridize()
estimator = get_model('alpha_pose_resnet101_v1b_coco', pretrained=True, ctx=ctx)
estimator.hybridize()

while True:
    # Read every frame in order: no seeking, so input and output stay in step
    ret, frame = cap.read()
    if ret:
        frame = mx.nd.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).astype('uint8')
        x, frame = gluoncv.data.transforms.presets.ssd.transform_test(frame, short=240)
        x = x.as_in_context(ctx)
        class_IDs, scores, bounding_boxs = detector(x)
        pose_input, upscale_bbox = detector_to_alpha_pose(frame, class_IDs, scores, bounding_boxs)
        if upscale_bbox is not None:
            predicted_heatmap = estimator(pose_input.as_in_context(ctx))
            pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
            img = cv_plot_keypoints(frame, pred_coords, confidence, class_IDs, bounding_boxs, scores,
                                    box_thresh=0.5, keypoint_thresh=0.2)
            out.write(img)
        else:
            out.write(frame)
    else:
        break

    if cv2.waitKey(1) == 27:  # leftover from the display version; harmless here
        break

cap.release()
out.release()
cv2.destroyAllWindows()
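One caveat worth checking on your OpenCV build (treat this as a caution, not a guarantee): transform_test resizes the frame to short=240, so the frames written above may not match the (width, height) the VideoWriter was opened with, and OpenCV tends to silently drop frames whose size doesn’t match. If the output file comes out empty or truncated, resize before writing:

img = cv2.resize(img, (width, height))  # match the size the writer was opened with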

Hope this helps.

With this I managed to save the video, but I ran into two problems: the colors are wrong, and the video length doesn’t match (the output tends to be shorter and plays much faster). I fixed the color issue by converting each frame back to BGR before writing it:

cap = cv2.VideoCapture(folder_raw + video_fname)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

frame_width = int(cap.get(3))   # CAP_PROP_FRAME_WIDTH
frame_height = int(cap.get(4))  # CAP_PROP_FRAME_HEIGHT
size = (frame_width, frame_height)

fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
result = cv2.VideoWriter(folder_result + 'demo_' + video_fname,
                         fourcc, fps, size)

start = time.time()
while True:
    sec = time.time() - start
    cap.set(cv2.CAP_PROP_POS_FRAMES, round(fps * sec))
    ret, frame = cap.read()

    if ret:
        frame = mx.nd.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).astype('uint8')
        x, frame = gluoncv.data.transforms.presets.ssd.transform_test(frame, short=240)
        x = x.as_in_context(context)

        class_IDs, scores, bounding_boxs = detector(x)
        pose_input, upscale_bbox = detector_to_alpha_pose(frame, class_IDs,
                                                          scores, bounding_boxs)

        if upscale_bbox is not None:
            predicted_heatmap = estimator(pose_input.as_in_context(context))
            pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
            img = cv_plot_keypoints(frame, pred_coords, confidence, class_IDs,
                                    bounding_boxs, scores,
                                    box_thresh=0.5, keypoint_thresh=0.1)

            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)  # back to BGR for VideoWriter
            img = cv2.resize(img, size)
            result.write(img)
        else:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            result.write(frame)
    else:
        break

    if cv2.waitKey(1) == 27:  # ESC to stop
        break

cap.release()
result.release()

cv2.destroyAllWindows()
print("The video was successfully saved.")

In your code, you don’t use all the frames, but you set the writer’s fps to that of your original video:

fps = int(cap.get(cv2.CAP_PROP_FPS))
result = cv2.VideoWriter(folder_result + 'demo_' + video_fname,
                         fourcc, fps, size)

The cap.set(...) seek skips every frame that inference can’t keep up with. For example, if inference takes 0.2 s per frame on a 25 fps source, only about one frame in five is written, and played back at 25 fps the result comes out roughly five times shorter and faster. So the output video becomes shorter, I think.
You should use all the frames of your original video.

This line is not needed:

cap.set(cv2.CAP_PROP_POS_FRAMES, round(fps * sec))
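Dropping that line, here is a minimal sketch of the corrected loop, reusing the variable names from your snippet (cap, result, size, context, detector and estimator are assumed to be set up as before); it simply reads every frame in order:

while True:
    ret, frame = cap.read()  # sequential read: every frame, in order
    if not ret:
        break
    frame = mx.nd.array(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).astype('uint8')
    x, frame = gluoncv.data.transforms.presets.ssd.transform_test(frame, short=240)
    x = x.as_in_context(context)
    class_IDs, scores, bounding_boxs = detector(x)
    pose_input, upscale_bbox = detector_to_alpha_pose(frame, class_IDs, scores, bounding_boxs)
    if upscale_bbox is not None:
        predicted_heatmap = estimator(pose_input.as_in_context(context))
        pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
        img = cv_plot_keypoints(frame, pred_coords, confidence, class_IDs,
                                bounding_boxs, scores, box_thresh=0.5, keypoint_thresh=0.1)
    else:
        img = frame
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)  # back to BGR before writing
    result.write(cv2.resize(img, size))         # match the size the writer was opened with

cap.release()
result.release()

Since every input frame is written at the source fps, the output duration should now match the input (frame_count / fps seconds).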

That helped; the output video matches better now, but it’s still not 100% correct. I sent you a direct message here, can you take a look and get back to me? Much appreciated.