Autonomy Software C++ 24.5.1
Welcome to the Autonomy Software repository of the Mars Rover Design Team (MRDT) at Missouri University of Science and Technology (Missouri S&T)! This API reference contains the source code and other resources for the development of the autonomy software for our Mars rover. The Autonomy Software project aims to compete in the University Rover Challenge (URC) by demonstrating advanced autonomous capabilities and robust navigation algorithms.
YOLOModel.hpp
1
13#ifndef YOLO_MODEL_HPP
14#define YOLO_MODEL_HPP
15
16#include "../../AutonomyConstants.h"
17#include "../../interfaces/TensorflowTPU.hpp"
18
20#include <nlohmann/json.hpp>
21#include <opencv2/opencv.hpp>
22#include <torch/script.h>
23#include <torch/torch.h>
24
26
27
35namespace yolomodel
36{
37
44 struct Detection
45 {
46 public:
48 // Define public struct attributes.
50
51 int nClassID; // The class index of the object. Dependent on class order when trained.
52 std::string szClassName; // The class name of the object. This is dependent on the class names used when training.
53 float fConfidence; // The detection confidence of the object.
54 cv::Rect cvBoundingBox; // An object used to access the dimensions and other properties of the object's bounding box.
55 };
56
57
71 inline void NonMaxSuppression(std::vector<Detection>& vObjects,
72 std::vector<int>& vClassIDs,
73 std::vector<float>& vClassConfidences,
74 std::vector<cv::Rect>& vBoundingBoxes,
75 float fMinObjectConfidence,
76 float fNMSThreshold)
77 {
78 // Create instance variables.
79 std::vector<int> vNMSValidIndices;
80
81 // Perform Non-Max Suppression using OpenCV's implementation.
82 cv::dnn::NMSBoxes(vBoundingBoxes, vClassConfidences, fMinObjectConfidence, fNMSThreshold, vNMSValidIndices);
83
84 // Loop through each valid index.
85 for (int nValidIndex : vNMSValidIndices)
86 {
87 // Create new Detection struct.
88 Detection stNewDetection;
89 // Repackage prediction data into easy-to-use struct.
90 stNewDetection.nClassID = vClassIDs[nValidIndex];
91 stNewDetection.fConfidence = vClassConfidences[nValidIndex];
92 stNewDetection.cvBoundingBox = vBoundingBoxes[nValidIndex];
93
94 // Append new object detection to objects vector.
95 vObjects.emplace_back(stNewDetection);
96 }
97 }
98
99
109 inline void DrawDetections(cv::Mat& cvInputFrame, std::vector<Detection>& vObjects)
110 {
111 // Loop through each detection.
112 for (const Detection& stObject : vObjects)
113 {
114 // Calculate the hue value based on the class ID. OpenCV 8-bit hue values range from 0-179.
115 int nHue = static_cast<int>(stObject.nClassID % 180);
116 // Set saturation and value to 255 for full intensity colors.
117 int nSaturation = 255;
118 int nValue = 255;
119
120 // Convert HSV to BGR.
121 cv::Mat cvHSV(1, 1, CV_8UC3, cv::Scalar(nHue, nSaturation, nValue));
122 cv::cvtColor(cvHSV, cvHSV, cv::COLOR_HSV2BGR);
123 // Extract the RGB values
124 cv::Vec3b cvConvertedValues = cvHSV.at<cv::Vec3b>(0, 0);
125 cv::Scalar cvBoxColor(cvConvertedValues[2], cvConvertedValues[1], cvConvertedValues[0]);
126
127 // Draw bounding box onto image.
128 cv::rectangle(cvInputFrame, stObject.cvBoundingBox, cvBoxColor, 2);
129 // Draw classID background box onto image.
130 cv::rectangle(cvInputFrame,
131 cv::Point(stObject.cvBoundingBox.x, stObject.cvBoundingBox.y - 20),
132 cv::Point(stObject.cvBoundingBox.x + stObject.cvBoundingBox.width, stObject.cvBoundingBox.y),
133 cvBoxColor,
134 cv::FILLED);
135 // Draw class text onto image.
136 cv::putText(cvInputFrame,
137 std::to_string(stObject.nClassID) + " " + std::to_string(stObject.fConfidence),
138 cv::Point(stObject.cvBoundingBox.x, stObject.cvBoundingBox.y - 5),
139 cv::FONT_HERSHEY_SIMPLEX,
140 0.5,
141 cv::Scalar(255, 255, 255));
142 }
143 }
144
145
155 namespace tensorflow
156 {
157
165 struct InputTensorDimensions
166 {
167 public:
169 // Define public struct attributes.
171
172 int nHeight; // The height of the input image.
173 int nWidth; // The width of the input image.
174 int nChannels; // The number of channels of the input image.
175 int nTensorIndex; // The index of the tensor used to retrieve it from the interpreter.
176 int nQuantZeroPoint; // The value of the quantized input tensor that represents zero.
177 float fQuantScale; // The multiplier of each value to scale to meaningful numbers. (quantization)
178 };
179
180
188 struct OutputTensorDimensions
189 {
190 public:
192 // Define public struct attributes.
194
195 int nAnchors; // Determined from the trained image size of the model.
196 int nObjectnessLocationClasses; // The number of data points of each anchor. Each anchor contains a vector 5+nc (YOLOv5) or 4+nc (YOLOv8) long, where
197 // nc is the number of classes the model has.
198 int nTensorIndex; // The index of the tensor used to retrieve it from the interpreter.
199 int nQuantZeroPoint; // The value of the quantized output tensor that represents zero.
200 float fQuantScale; // The multiplier of each value to scale to meaningful numbers. (Undo quantization)
201 };
202
203
213 class TPUInterpreter : public TensorflowTPU<std::vector<std::vector<Detection>>, cv::Mat>
214 {
215 public:
217 // Declare public enums that are specific to and used within this class.
219
221 // Declare public methods and member variables.
223
224
240 TPUInterpreter(std::string szModelPath,
241 PerformanceModes ePowerMode = PerformanceModes::eHigh,
242 unsigned int unMaxBulkInQueueLength = 32,
243 bool bUSBAlwaysDFU = false) :
244 TensorflowTPU<std::vector<std::vector<Detection>>, cv::Mat>(szModelPath, ePowerMode, unMaxBulkInQueueLength, bUSBAlwaysDFU)
245
246 {}
247
248
255 ~TPUInterpreter()
256 {
257 // Nothing to destroy.
258 }
259
260
277 std::vector<std::vector<Detection>> Inference(const cv::Mat& cvInputFrame,
278 const float fMinObjectConfidence = 0.85,
279 const float fNMSThreshold = 0.6) override
280 {
281 // Create instance variables.
282 std::vector<std::vector<Detection>> vTensorObjectOutputs;
283
284 // Get the input tensor shape for the model.
285 InputTensorDimensions stInputDimensions = this->GetInputShape(m_pInterpreter->inputs()[0]);
286
287 // Copy given frame to class member variable.
288 m_cvFrame = cvInputFrame;
289
290 // Check if model is open and device is ready.
291 if (m_bDeviceOpened && m_pEdgeTPUContext->IsReady())
292 {
293 // Check if the image has the correct type.
294 if (m_cvFrame.type() != CV_8UC3)
295 {
296 // Convert image to unsigned int8 image.
297 m_cvFrame.convertTo(m_cvFrame, CV_8UC3);
298 }
299
300 // Check if the input image matches the input tensor shape.
301 if (m_cvFrame.rows != stInputDimensions.nHeight || m_cvFrame.cols != stInputDimensions.nWidth)
302 {
303 // Resize the image, and store a local copy of it.
304 cv::resize(m_cvFrame,
305 m_cvFrame,
306 cv::Size(stInputDimensions.nWidth, stInputDimensions.nHeight),
307 0.0, 0.0, constants::BASICCAM_RESIZE_INTERPOLATION_METHOD);
308 }
309
310 // Create a vector to store reshaped input image in 1 dimension.
311 std::vector<int8_t> vInputData(m_cvFrame.data,
312 m_cvFrame.data + (static_cast<unsigned long>(m_cvFrame.cols) * m_cvFrame.rows * m_cvFrame.elemSize()));
313 // Quantize input data.
314 // for (long unsigned int nIter = 0; nIter < vInputData.size(); ++nIter)
315 // {
316 // // Quantize value.
317 // vInputData[nIter] = std::round((vInputData[nIter] - 128) / stInputDimensions.fQuantScale) + stInputDimensions.nQuantZeroPoint;
318 // // vInputData[nIter] = vInputData[nIter] - 128;
319 // }
320 // Retrieve a new input tensor from the TPU interpreter and copy data to it. This tensor is automatically quantized because it is typed.
321 TfLiteTensor* pInputTensor = m_pInterpreter->tensor(stInputDimensions.nTensorIndex);
322 std::memcpy(pInputTensor->data.raw, vInputData.data(), vInputData.size());
323
324 // Run inference on the EdgeTPU.
325 if (m_pInterpreter->Invoke() != kTfLiteOk)
326 {
327 // Submit logger message.
328 LOG_WARNING(logging::g_qSharedLogger,
329 "Inferencing failed on an image for model {} with device {} ({})",
330 m_szModelPath,
331 m_tpuDevice.path,
332 this->DeviceTypeToString(m_tpuDevice.type));
333 }
334 else
335 {
336 // Create separate vectors for storing class confidences, bounding boxes, and classIDs.
337 std::vector<int> vClassIDs;
338 std::vector<float> vClassConfidences;
339 std::vector<cv::Rect> vBoundingBoxes;
340 // Create vector for storing all detections for this tensor output.
341 std::vector<Detection> vObjects;
342
343 // Get output indices for output tensors.
344 for (int nTensorIndex : m_pInterpreter->outputs())
345 {
346 // Clear prediction data vectors.
347 vClassIDs.clear();
348 vClassConfidences.clear();
349 vBoundingBoxes.clear();
350 // Clear object detections vector.
351 vObjects.clear();
352
353 /*
354 Check if the output tensor has a YOLOv5 format.
355 */
356 // Get the tensor output shape details.
357 OutputTensorDimensions stOutputDimensions = this->GetOutputShape(nTensorIndex);
358 // Calculate the number of grid cells at each YOLO output stride (P3=8, P4=16, P5=32) based on the input tensor shape.
359 int nImgSize = stInputDimensions.nHeight;
360 int nP3Stride = std::pow((nImgSize / 8), 2);
361 int nP4Stride = std::pow((nImgSize / 16), 2);
362 int nP5Stride = std::pow((nImgSize / 32), 2);
363 // Calculate the proper prediction length for different YOLO versions.
364 int nYOLOv5AnchorsPerGridPoint = 3;
365 int nYOLOv8AnchorsPerGridPoint = 1;
366 int nYOLOv5TotalPredictionLength =
367 (nP3Stride * nYOLOv5AnchorsPerGridPoint) + (nP4Stride * nYOLOv5AnchorsPerGridPoint) + (nP5Stride * nYOLOv5AnchorsPerGridPoint);
368 int nYOLOv8TotalPredictionLength =
369 (nP3Stride * nYOLOv8AnchorsPerGridPoint) + (nP4Stride * nYOLOv8AnchorsPerGridPoint) + (nP5Stride * nYOLOv8AnchorsPerGridPoint);
370
371 // Output tensor is YOLOv5 format.
372 if (stOutputDimensions.nAnchors == nYOLOv5TotalPredictionLength)
373 {
374 // Parse inferenced output from tensor.
375 this->ParseTensorOutputYOLOv5(nTensorIndex,
376 vClassIDs,
377 vClassConfidences,
378 vBoundingBoxes,
379 fMinObjectConfidence,
380 cvInputFrame.cols,
381 cvInputFrame.rows);
382 }
383 // Output tensor is YOLOv8 format.
384 else if (stOutputDimensions.nAnchors == nYOLOv8TotalPredictionLength)
385 {
386 // Parse inferenced output from tensor.
387 this->ParseTensorOutputYOLOv8(nTensorIndex,
388 vClassIDs,
389 vClassConfidences,
390 vBoundingBoxes,
391 fMinObjectConfidence,
392 cvInputFrame.cols,
393 cvInputFrame.rows);
394 }
395
396 // Perform NMS to filter out bad/duplicate detections.
397 NonMaxSuppression(vObjects, vClassIDs, vClassConfidences, vBoundingBoxes, fMinObjectConfidence, fNMSThreshold);
398
399 // Append object detections to the tensor outputs vector.
400 vTensorObjectOutputs.emplace_back(vObjects);
401 }
402 }
403 }
404 else
405 {
406 // Submit logger message.
407 LOG_WARNING(logging::g_qSharedLogger,
408 "Inferencing failed on an image for model {} with device {} ({})",
409 m_szModelPath,
410 m_tpuDevice.path,
411 this->DeviceTypeToString(m_tpuDevice.type));
412 }
413
414 return vTensorObjectOutputs;
415 }
416
417 private:
419 // Declare private methods.
421
422
448 void ParseTensorOutputYOLOv5(int nOutputIndex,
449 std::vector<int>& vClassIDs,
450 std::vector<float>& vClassConfidences,
451 std::vector<cv::Rect>& vBoundingBoxes,
452 float fMinObjectConfidence,
453 int nOriginalFrameWidth,
454 int nOriginalFrameHeight)
455 {
456 // Retrieve output tensor from interpreter.
457 TfLiteTensor* tfOutputTensor = m_pInterpreter->tensor(nOutputIndex);
458 // Get output tensor shape.
459 OutputTensorDimensions stOutputDimensions = this->GetOutputShape(nOutputIndex);
460 // Create vector for storing temporary values for this prediction.
461 std::vector<float> vGridPrediction;
462 // Resize the Grid prediction vector to match the number of classes + bounding_box + objectness score.
463 vGridPrediction.resize(stOutputDimensions.nObjectnessLocationClasses);
464
465 /*
466 Loop through each grid cell output of the model output and filter out objects that don't meet conf thresh.
467 Then, repackage into nice detection structs.
468 For YOLOv5, you divide your image size, i.e. 640 by the P3, P4, P5 output strides of 8, 16, 32 to arrive at grid sizes
469 of 80x80, 40x40, 20x20. Each grid point has 3 anchors by default (anchor box values: small, medium, large), and each anchor contains a vector 5 +
470 nc long, where nc is the number of classes the model has. So for a 640 image, the output tensor will be [1, 25200, 85]
471 */
472 for (int nIter = 0; nIter < stOutputDimensions.nAnchors; ++nIter)
473 {
474 // Get objectness confidence. This is the 5th value for each grid/anchor prediction. (4th index)
475 float fObjectnessConfidence =
476 (tfOutputTensor->data.uint8[(nIter * stOutputDimensions.nObjectnessLocationClasses) + 4] - stOutputDimensions.nQuantZeroPoint) *
477 stOutputDimensions.fQuantScale;
478
479 // Check if the object confidence is greater than or equal to the threshold.
480 if (fObjectnessConfidence >= fMinObjectConfidence)
481 {
482 // Loop through the number of object info and class confidences in the 2nd dimension.
483 // Predictions have format {center_x, center_y, width, height, object_conf, class0_conf, class1_conf, ...}
484 for (int nJter = 0; nJter < stOutputDimensions.nObjectnessLocationClasses; ++nJter)
485 {
486 // Repackage value into more usable vector. Also undo the quantization of the data.
487 vGridPrediction[nJter] =
488 (tfOutputTensor->data.uint8[(nIter * stOutputDimensions.nObjectnessLocationClasses) + nJter] - stOutputDimensions.nQuantZeroPoint) *
489 stOutputDimensions.fQuantScale;
490 }
491
492 // Find class ID based on which class confidence has the highest score.
493 std::vector<float>::iterator pStartIterator = vGridPrediction.begin() + 5;
494 std::vector<float>::iterator pMaxConfidence = std::max_element(pStartIterator, vGridPrediction.end());
495 int nClassID = std::distance(pStartIterator, pMaxConfidence);
496 // Get prediction confidence for class ID.
497 float fClassConfidence = vGridPrediction[nClassID + 5];
498 // Scale bounding box to match original input image size.
499 cv::Rect cvBoundingBox;
500 int nCenterX = vGridPrediction[0] * nOriginalFrameWidth;
501 int nCenterY = vGridPrediction[1] * nOriginalFrameHeight;
502 int nWidth = vGridPrediction[2] * nOriginalFrameWidth;
503 int nHeight = vGridPrediction[3] * nOriginalFrameHeight;
504 // Check if the width and height of the object are greater than zero.
505 if (nWidth > 0 && nHeight > 0)
506 {
507 // Repackaged bounding box data to be more readable.
508 cvBoundingBox.x = int(nCenterX - (0.5 * nWidth)); // Rect.x is the top-left corner not center point.
509 cvBoundingBox.y = int(nCenterY - (0.5 * nHeight)); // Rect.y is the top-left corner not center point.
510 cvBoundingBox.width = nWidth;
511 cvBoundingBox.height = nHeight;
512 // Add data to vectors.
513 vClassIDs.emplace_back(nClassID);
514 vClassConfidences.emplace_back(fClassConfidence);
515 vBoundingBoxes.emplace_back(cvBoundingBox);
516 }
517 }
518 }
519 }
520
521
544 void ParseTensorOutputYOLOv8(int nOutputIndex,
545 std::vector<int>& vClassIDs,
546 std::vector<float>& vClassConfidences,
547 std::vector<cv::Rect>& vBoundingBoxes,
548 float fMinObjectConfidence,
549 int nOriginalFrameWidth,
550 int nOriginalFrameHeight)
551 {
552 // Retrieve output tensor from interpreter.
553 TfLiteTensor* tfOutputTensor = m_pInterpreter->tensor(nOutputIndex);
554 // Get output tensor shape.
555 OutputTensorDimensions stOutputDimensions = this->GetOutputShape(nOutputIndex);
556 // Create vector for storing temporary values for this prediction.
557 std::vector<float> vGridPrediction;
558 // Resize the Grid prediction vector to match the number of classes + bounding_box + objectness score.
559 vGridPrediction.resize(stOutputDimensions.nObjectnessLocationClasses);
560
561 /*
562 Loop through each grid cell output of the model output and filter out objects that don't meet conf thresh.
563 Then, repackage into nice detection structs.
564 For YOLOv8, you divide your image size, i.e. 640 by the P3, P4, P5 output strides of 8, 16, 32 to arrive at grid sizes
565 of 80x80, 40x40, 20x20. Each grid point has 1 anchor, and each anchor contains a vector 4 + nc long, where nc is the number
566 of classes the model has. So for a 640 image, the output tensor will be [1, 84, 8400] (80 classes). Notice how the larger dimension is swapped
567 when compared to YOLOv5.
568 */
569 for (int nIter = 0; nIter < stOutputDimensions.nAnchors; ++nIter)
570 {
571 // Loop through the number of object info and class confidences in the 2nd dimension.
572 // Predictions have format {center_x, center_y, width, height, class0_conf, class1_conf, ...}
574 for (int nJter = 0; nJter < stOutputDimensions.nObjectnessLocationClasses; ++nJter)
575 {
576 // Repackage values into more usable vector. Also undo the quantization of the data.
577 vGridPrediction[nJter] = (tfOutputTensor->data.int8[nIter + (nJter * stOutputDimensions.nAnchors)] - stOutputDimensions.nQuantZeroPoint) *
578 stOutputDimensions.fQuantScale;
579 }
580
581 // Find class ID based on which class confidence has the highest score.
582 std::vector<float>::iterator pStartIterator = vGridPrediction.begin() + 4;
583 std::vector<float>::iterator pMaxConfidence = std::max_element(pStartIterator, vGridPrediction.end());
584 int nClassID = std::distance(pStartIterator, pMaxConfidence);
585 // Get prediction confidence for class ID.
586 float fClassConfidence = vGridPrediction[nClassID + 4];
587
588 // Check if class confidence meets threshold.
589 if (fClassConfidence >= fMinObjectConfidence)
590 {
591 // Scale bounding box to match original input image size.
592 cv::Rect cvBoundingBox;
593 int nCenterX = vGridPrediction[0] * nOriginalFrameWidth;
594 int nCenterY = vGridPrediction[1] * nOriginalFrameHeight;
595 int nWidth = vGridPrediction[2] * nOriginalFrameWidth;
596 int nHeight = vGridPrediction[3] * nOriginalFrameHeight;
597 // Repackaged bounding box data to be more readable.
598 cvBoundingBox.x = int(nCenterX - (0.5 * nWidth)); // Rect.x is the top-left corner not center point.
599 cvBoundingBox.y = int(nCenterY - (0.5 * nHeight)); // Rect.y is the top-left corner not center point.
600 cvBoundingBox.width = nWidth;
601 cvBoundingBox.height = nHeight;
602 // Add data to vectors.
603 vClassIDs.emplace_back(nClassID);
604 vClassConfidences.emplace_back(fClassConfidence);
605 vBoundingBoxes.emplace_back(cvBoundingBox);
606 }
607 }
608 }
609
610
622 InputTensorDimensions GetInputShape(const int nTensorIndex = 0)
623 {
624 // Create instance variables.
625 InputTensorDimensions stInputDimensions = {0, 0, 0, 0, 0, 0};
626
627 // Check if interpreter has been built.
628 if (m_bDeviceOpened)
629 {
630 // Get the desired input tensor shape of the model.
631 TfLiteTensor* tfInputTensor = m_pInterpreter->tensor(nTensorIndex);
632 TfLiteIntArray* tfDimensions = tfInputTensor->dims;
633
634 // Package dimensions into struct.
635 stInputDimensions.nHeight = tfDimensions->data[1];
636 stInputDimensions.nWidth = tfDimensions->data[2];
637 stInputDimensions.nChannels = tfDimensions->data[3];
638 stInputDimensions.nTensorIndex = nTensorIndex;
639 // Get the quantization zero point and scale for the input tensor.
640 stInputDimensions.nQuantZeroPoint = tfInputTensor->params.zero_point;
641 stInputDimensions.fQuantScale = tfInputTensor->params.scale;
642 }
643
644 return stInputDimensions;
645 }
646
647
659 OutputTensorDimensions GetOutputShape(const int nTensorIndex = 0)
660 {
661 // Create instance variables.
662 OutputTensorDimensions stOutputDimensions = {0, 0, 0, 0, 0};
663
664 // Check if interpreter has been built.
665 if (m_bDeviceOpened)
666 {
667 // Get the desired output tensor shape of the model.
668 TfLiteTensor* tfOutputTensor = m_pInterpreter->tensor(nTensorIndex);
669 TfLiteIntArray* tfDimensions = tfOutputTensor->dims;
670
671 // Package dimensions into struct. Assume anchors will always be the longer dimension.
672 stOutputDimensions.nAnchors = std::max(tfDimensions->data[1], tfDimensions->data[2]);
673 stOutputDimensions.nObjectnessLocationClasses = std::min(tfDimensions->data[1], tfDimensions->data[2]);
674 stOutputDimensions.nTensorIndex = nTensorIndex;
675 // Get the quantization zero point and scale for output tensor.
676 stOutputDimensions.nQuantZeroPoint = tfOutputTensor->params.zero_point;
677 stOutputDimensions.fQuantScale = tfOutputTensor->params.scale;
678 }
679
680 return stOutputDimensions;
681 }
682
684 // Declare private member variables.
686 cv::Mat m_cvFrame;
687 };
688 } // namespace tensorflow
689
690
700 namespace pytorch
701 {
702
709 class PyTorchInterpreter
710 {
711 public:
713 // Declare public enums that are specific to and used within this class.
715 enum class HardwareDevices
716 {
717 eCPU, // The CPU device.
718 eCUDA // The CUDA device.
719 };
720
722 // Declare public methods and member variables.
724
725
734 PyTorchInterpreter(std::string szModelPath, HardwareDevices eHardwareDevice = HardwareDevices::eCUDA)
735 {
736 // Initialize member variables.
737 m_szModelPath = szModelPath;
738 m_bReady = false;
739 m_cvModelInputSize = cv::Size(640, 640);
740 m_szModelTask = "Unknown";
741 m_vClassLabels = std::vector<std::string>();
742
743 // Translate the hardware device enum to a torch device.
744 switch (eHardwareDevice)
745 {
746 case HardwareDevices::eCPU: m_trDevice = torch::kCPU; break;
747 case HardwareDevices::eCUDA: m_trDevice = torch::kCUDA; break;
748 default: m_trDevice = torch::kCPU; break;
749 }
750
751 // Submit logger message.
752 LOG_INFO(logging::g_qSharedLogger, "Attempting to load model {} onto device {}", szModelPath, m_trDevice.str());
753
754 // Check if the model path is valid.
755 if (!std::filesystem::exists(szModelPath))
756 {
757 // Submit logger message.
758 LOG_ERROR(logging::g_qSharedLogger, "Model path {} does not exist!", szModelPath);
759 return;
760 }
761 // Check if the device is available.
762 if (!torch::cuda::is_available() && m_trDevice == torch::kCUDA)
763 {
764 // Submit logger message.
765 LOG_ERROR(logging::g_qSharedLogger, "CUDA device is not available, falling back to CPU.");
766 m_trDevice = torch::kCPU;
768 }
769 else
770 {
771 // Submit logger message.
772 LOG_INFO(logging::g_qSharedLogger, "Using device: {}", m_trDevice.str());
773 }
774
775 // Finally, attempt to load the model.
776 try
777 {
778 // Load the model and set it to eval mode for inference.
779 torch::jit::ExtraFilesMap trExtraConfigFiles{{"config.txt", ""}};
780 m_trModel = torch::jit::load(szModelPath, m_trDevice, trExtraConfigFiles);
781 m_trModel.eval();
782
783 // Use nlohmann json to parse the config file.
784 nlohmann::json jConfig = nlohmann::json::parse(trExtraConfigFiles.at("config.txt"));
785 // Get the input image size for the model.
786 m_cvModelInputSize = cv::Size(jConfig["imgsz"][0], jConfig["imgsz"][1]);
787 m_szModelTask = jConfig["task"];
788 for (const auto& item : jConfig["names"].items())
789 {
790 m_vClassLabels.push_back(item.value());
791 }
792 // Submit the config json as a debug message.
793 LOG_DEBUG(logging::g_qSharedLogger, "Model config: {}", jConfig.dump(4));
794
795 // Check if the model is empty.
796 if (m_trModel.get_methods().empty())
797 {
798 LOG_ERROR(logging::g_qSharedLogger, "Model is empty! Check if the correct model file was provided.");
799 return;
800 }
801 // Check if the model did not move to the expected device.
802 if (m_trModel.buffers().size() > 0)
803 {
804 // Get the device of the model.
805 torch::Device model_device = m_trModel.buffers().begin().operator->().device();
806 if (model_device != m_trDevice)
807 {
808 LOG_ERROR(logging::g_qSharedLogger, "Model did not move to the expected device! Model is on: {}", model_device.str());
809 return;
810 }
811 }
812 else
813 {
814 LOG_WARNING(logging::g_qSharedLogger, "Model has no buffers to check the device.");
815 }
816
817 // Model is ready for inference.
818 LOG_INFO(logging::g_qSharedLogger,
819 "Model successfully loaded and set to eval mode. The model is a {} model, and has {} classes.",
820 m_szModelTask,
821 m_vClassLabels.size());
822
823 // Set flag saying we are ready for inference.
824 m_bReady = true;
825 }
826 catch (const c10::Error& trError)
827 {
828 LOG_ERROR(logging::g_qSharedLogger, "Error loading model: {}", trError.what());
829 }
830 }
831
832
839 ~PyTorchInterpreter()
840 {
841 // Nothing to destroy.
842 }
843
844
859 std::vector<Detection> Inference(const cv::Mat& cvInputFrame, const float fMinObjectConfidence = 0.85, const float fNMSThreshold = 0.6)
860 {
861 // Force single-threaded execution (if acceptable for your workload)
862 torch::set_num_threads(1);
863 // Create instance variables.
864 std::vector<Detection> vObjects;
865
866 // Preprocess the given image and pack it into a tensor.
867 torch::Tensor trTensorImage = PreprocessImage(cvInputFrame, m_trDevice);
868
869 // Perform inference.
870 std::vector<torch::jit::IValue> vInputs;
871 vInputs.push_back(trTensorImage);
872 torch::Tensor trOutputTensor;
873 try
874 {
875 trOutputTensor = m_trModel.forward(vInputs).toTensor();
876 }
877 catch (const c10::Error& trError)
878 {
879 LOG_ERROR(logging::g_qSharedLogger, "Error running inference: {}", trError.what());
880 return vObjects;
881 }
882
883 // Calculate the number of grid cells at each YOLO output stride (P3=8, P4=16, P5=32) based on the input tensor shape.
884 int nImgSize = m_cvModelInputSize.height;
885 int nP3Stride = std::pow((nImgSize / 8), 2);
886 int nP4Stride = std::pow((nImgSize / 16), 2);
887 int nP5Stride = std::pow((nImgSize / 32), 2);
888 // Calculate the proper prediction length for different YOLO versions.
889 int nYOLOv5AnchorsPerGridPoint = 3;
890 int nYOLOv8AnchorsPerGridPoint = 1;
891 int nYOLOv5TotalPredictionLength =
892 (nP3Stride * nYOLOv5AnchorsPerGridPoint) + (nP4Stride * nYOLOv5AnchorsPerGridPoint) + (nP5Stride * nYOLOv5AnchorsPerGridPoint);
893 int nYOLOv8TotalPredictionLength =
894 (nP3Stride * nYOLOv8AnchorsPerGridPoint) + (nP4Stride * nYOLOv8AnchorsPerGridPoint) + (nP5Stride * nYOLOv8AnchorsPerGridPoint);
895
896 // Parse the output tensor.
897 std::vector<int> vClassIDs;
898 std::vector<std::string> vClassLabels;
899 std::vector<float> vClassConfidences;
900 std::vector<cv::Rect> vBoundingBoxes;
901
902 // Get the largest dimension of our output tensor.
903 int nLargestDimension = *std::max_element(trOutputTensor.sizes().begin(), trOutputTensor.sizes().end());
904 // Check if the output tensor is YOLOv5 format.
905 if (nLargestDimension == nYOLOv5TotalPredictionLength)
906 {
907 // Parse inferenced output from tensor.
908 this->ParseTensorOutputYOLOv5(trOutputTensor, vClassIDs, vClassConfidences, vBoundingBoxes, cvInputFrame.size(), fMinObjectConfidence);
909 }
910 // Check if the output tensor is YOLOv8 format.
911 else if (nLargestDimension == nYOLOv8TotalPredictionLength)
912 {
913 // Parse inferenced output from tensor.
914 this->ParseTensorOutputYOLOv8(trOutputTensor, vClassIDs, vClassConfidences, vBoundingBoxes, cvInputFrame.size(), fMinObjectConfidence);
915 }
916
917 // Perform NMS to filter out bad/duplicate detections.
918 NonMaxSuppression(vObjects, vClassIDs, vClassConfidences, vBoundingBoxes, fMinObjectConfidence, fNMSThreshold);
919
920 // Loop through the final detections and set the class names for each detection based on the class ID.
921 for (size_t nIter = 0; nIter < vObjects.size(); ++nIter)
922 {
923 // Check if the class ID is valid.
924 if (vClassIDs[nIter] >= 0 && vClassIDs[nIter] < static_cast<int>(m_vClassLabels.size()))
925 {
926 vObjects[nIter].szClassName = m_vClassLabels[vClassIDs[nIter]];
927 }
928 else
929 {
930 vObjects[nIter].szClassName = "UnknownClass";
931 }
932 }
933
934 return vObjects;
935 }
936
937
946 bool IsReadyForInference() const { return m_bReady; }
947
948 private:
950 // Declare private methods.
952
953
964 torch::Tensor PreprocessImage(const cv::Mat& cvInputFrame, const torch::Device& trDevice)
965 {
966 // Resize the input image to match model and normalize it to 0-1.
967 cv::Mat cvResizedImage;
968 cv::resize(cvInputFrame, cvResizedImage, cv::Size(m_cvModelInputSize.width, m_cvModelInputSize.height), 0.0, 0.0, cv::INTER_LINEAR);
969 cvResizedImage.convertTo(cvResizedImage, CV_32FC3, 1.0 / 255.0);
970
971 // Convert OpenCV mat to a tensor.
972 torch::Tensor trTensorImage = torch::from_blob(cvResizedImage.data, {1, cvResizedImage.rows, cvResizedImage.cols, 3}, torch::kFloat);
973 trTensorImage = trTensorImage.permute({0, 3, 1, 2}); // Convert to CxHxW format.
974 trTensorImage = trTensorImage.to(trDevice); // Move tensor to the specified hardware device.
975
976 return trTensorImage;
977 }
978
979
993 void ParseTensorOutputYOLOv5(const torch::Tensor& trOutput,
994 std::vector<int>& vClassIDs,
995 std::vector<float>& vClassConfidences,
996 std::vector<cv::Rect>& vBoundingBoxes,
997 const cv::Size& cvInputFrameSize,
998 const float fMinObjectConfidence)
999 {
1000 /*
1001 * For YOLOv5, you divide your image size, i.e. 640 by the P3, P4, P5 output strides of 8, 16, 32 to arrive at grid sizes
1002 * of 80x80, 40x40, 20x20. Each grid point has 3 anchors by default (anchor box values: small, medium, large), and each anchor contains a vector 5 +
1003 * nc long, where nc is the number of classes the model has. So for a 640 image, the output tensor will be [1, 25200, 85]
1004 */
1005 // Squeeze the batch dimension from the output tensor.
1006 torch::Tensor trSqueezedOutput = trOutput.squeeze(0);
1007
1008 // Move the tensor to CPU if necessary. If we're using GPU and we don't move the tensor to CPU, we will get an error and it will be slow.
1009 if (trSqueezedOutput.device().is_cuda())
1010 {
1011 trSqueezedOutput = trSqueezedOutput.to(torch::kCPU);
1012 }
1013 // Convert tensor to float if necessary.
1014 if (trSqueezedOutput.scalar_type() != torch::kFloat32)
1015 {
1016 trSqueezedOutput = trSqueezedOutput.to(torch::kFloat32);
1017 }
1018 // Ensure tensor is contiguous in memory.
1019 if (!trSqueezedOutput.is_contiguous())
1020 {
1021 trSqueezedOutput = trSqueezedOutput.contiguous();
1022 }
1023
1024 // Create an accessor for fast element-wise access.
1025 at::TensorAccessor trAccessor = trSqueezedOutput.accessor<float, 2>();
1026 const int nNumDetections = trSqueezedOutput.size(0);
1027 const int nTotalValues = trSqueezedOutput.size(1); // equals 5 + number_of_classes
1028
1029 // Loop through each detection.
1030 for (int i = 0; i < nNumDetections; i++)
1031 {
1032 // Get the objectness confidence. This is the 5th value for each grid/anchor prediction. (4th index)
1033 float fObjectnessConfidence = trAccessor[i][4];
1034
1035 // Check if the object confidence is greater than or equal to the threshold.
1036 if (fObjectnessConfidence < fMinObjectConfidence)
1037 {
1038 continue;
1039 }
1040
1041 // Retrieve bounding box data.
1042 float fCenterX = trAccessor[i][0];
1043 float fCenterY = trAccessor[i][1];
1044 float fWidth = trAccessor[i][2];
1045 float fHeight = trAccessor[i][3];
1046
1047 // Scale bounding box to original image size.
1048 int nLeft = static_cast<int>((fCenterX - (0.5 * fWidth)) * cvInputFrameSize.width);
1049 int nTop = static_cast<int>((fCenterY - (0.5 * fHeight)) * cvInputFrameSize.height);
1050 int nBoundingWidth = static_cast<int>(fWidth * cvInputFrameSize.width);
1051 int nBoundingHeight = static_cast<int>(fHeight * cvInputFrameSize.height);
1052
1053 // Repackaged bounding box data to be more readable.
1054 cv::Rect cvBoundingBox(nLeft, nTop, nBoundingWidth, nBoundingHeight);
1055
1056 // Loop over class confidence values and find the class ID with the highest confidence.
1057 float fClassConfidence = -1.0f;
1058 int nClassID = -1;
1059 for (int j = 5; j < nTotalValues; j++)
1060 {
1061 float fConfidence = trAccessor[i][j];
1062 if (fConfidence > fClassConfidence)
1063 {
1064 fClassConfidence = fConfidence;
1065 nClassID = j - 5;
1066 }
1067 }
1068
1069 // Only process detections that meet the minimum confidence.
1070 if (fClassConfidence < fMinObjectConfidence)
1071 {
1072 continue;
1073 }
1074
1075 // Add data to vectors.
1076 vClassIDs.emplace_back(nClassID);
1077 vClassConfidences.emplace_back(fClassConfidence);
1078 vBoundingBoxes.emplace_back(cvBoundingBox);
1079 }
1080 }
1081
1082
1101 void ParseTensorOutputYOLOv8(const torch::Tensor& trOutput,
1102 std::vector<int>& vClassIDs,
1103 std::vector<float>& vClassConfidences,
1104 std::vector<cv::Rect>& vBoundingBoxes,
1105 const cv::Size& cvInputFrameSize,
1106 const float fMinObjectConfidence)
1107 {
1108 /*
1109 * Permute the output tensor shape to match the expected format of the model. If the model is YOLOv8, the output
1110 * shape for a 640x640 image will be [1, 4 + nc, 8400] (nc = number of classes). Notice how the larger dimension is swapped
1111 * when compared to YOLOv5. We will permute the tensor to [1, 8400, 4 + nc] to make it easier to parse. Then squeeze the
1112 * tensor to remove the batch dimension so the final shape will be [8400, 4 + nc]. Thanks pytorch for being cool with the
1113 * permute function.
1114 */
1115 // Permute the tensor shape from [1, 4 + nc, 8400] to [1, 8400, 4 + nc]
1116 // and then squeeze to remove the batch dimension, resulting in [8400, 4 + nc]
1117 torch::Tensor trPermuteOutput = trOutput.permute({0, 2, 1}).squeeze(0);
1118
1119 // Move tensor to CPU if necessary. If we're using GPU and we don't move the tensor to CPU, we will get an error and it will be slow.
1120 if (trPermuteOutput.device().is_cuda())
1121 {
1122 trPermuteOutput = trPermuteOutput.to(torch::kCPU);
1123 }
1124 // Convert tensor to float if necessary.
1125 if (trPermuteOutput.scalar_type() != torch::kFloat32)
1126 {
1127 trPermuteOutput = trPermuteOutput.to(torch::kFloat32);
1128 }
1129 // Ensure tensor is contiguous in memory.
1130 if (!trPermuteOutput.is_contiguous())
1131 {
1132 trPermuteOutput = trPermuteOutput.contiguous();
1133 }
1134
1135 // Create an accessor for fast element-wise access.
1136 at::TensorAccessor trAccessor = trPermuteOutput.accessor<float, 2>();
1137 const int nNumDetections = trPermuteOutput.size(0);
1138 const int nTotalValues = trPermuteOutput.size(1); // equals 4 + number_of_classes
1139
1140 // Loop through each detection.
1141 for (int i = 0; i < nNumDetections; i++)
1142 {
1143 float fClassConfidence = -1.0f;
1144 int nClassID = -1;
1145
1146 // Loop over class confidence values.
1147 for (int j = 4; j < nTotalValues; j++)
1148 {
1149 float fConfidence = trAccessor[i][j];
1150 if (fConfidence > fClassConfidence)
1151 {
1152 fClassConfidence = fConfidence;
1153 nClassID = j - 4;
1154 }
1155 }
1156
1157 // Only process detections that meet the minimum confidence.
1158 if (fClassConfidence < fMinObjectConfidence)
1159 {
1160 continue;
1161 }
1162
1163 // Retrieve bounding box data.
1164 float fCenterX = trAccessor[i][0];
1165 float fCenterY = trAccessor[i][1];
1166 float fWidth = trAccessor[i][2];
1167 float fHeight = trAccessor[i][3];
1168
1169 // Scale bounding box to original image size.
1170 int nLeft = static_cast<int>((fCenterX - (0.5f * fWidth)) * cvInputFrameSize.width / m_cvModelInputSize.width);
1171 int nTop = static_cast<int>((fCenterY - (0.5f * fHeight)) * cvInputFrameSize.height / m_cvModelInputSize.height);
1172 int nBoxWidth = static_cast<int>(fWidth * cvInputFrameSize.width / m_cvModelInputSize.width);
1173 int nBoxHeight = static_cast<int>(fHeight * cvInputFrameSize.height / m_cvModelInputSize.height);
1174 cv::Rect cvBoundingBox(nLeft, nTop, nBoxWidth, nBoxHeight);
1175
1176 // Append results.
1177 vClassIDs.push_back(nClassID);
1178 vClassConfidences.push_back(fClassConfidence);
1179 vBoundingBoxes.push_back(cvBoundingBox);
1180 }
1181 }
1182
1184 // Declare private member variables.
1186 torch::jit::script::Module m_trModel;
1187 torch::Device m_trDevice = torch::kCPU;
1188 std::string m_szModelPath;
1189 bool m_bReady;
1190 std::string m_szModelTask;
1191 cv::Size m_cvModelInputSize;
1192 std::vector<std::string> m_vClassLabels;
1193 };
1194 } // namespace pytorch
1195} // namespace yolomodel
1196
1197#endif
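
The listing above is the header as it appears in the repository. As a rough illustration of how it is meant to be consumed, the sketch below constructs a PyTorchInterpreter, runs inference on a single frame, and overlays the results with DrawDetections. It is a minimal, hypothetical example: the include path, model path, and image file names are placeholders and not files that ship with the repository.

#include "YOLOModel.hpp"    // Placeholder include; adjust the path to wherever the header lives in the repository.

#include <opencv2/opencv.hpp>

int main()
{
    // Load a TorchScript YOLO model. The path is a placeholder. CUDA is requested here;
    // the constructor logs and handles the case where it is unavailable.
    yolomodel::pytorch::PyTorchInterpreter trInterpreter("path/to/model.torchscript",
                                                         yolomodel::pytorch::PyTorchInterpreter::HardwareDevices::eCUDA);

    // Stop if the model failed to load.
    if (!trInterpreter.IsReadyForInference())
    {
        return 1;
    }

    // Read a test image. Any BGR cv::Mat, such as a camera frame, works the same way.
    cv::Mat cvFrame = cv::imread("test_image.jpg");
    if (cvFrame.empty())
    {
        return 1;
    }

    // Run inference with the default confidence (0.85) and NMS (0.6) thresholds.
    std::vector<yolomodel::Detection> vDetections = trInterpreter.Inference(cvFrame);

    // Draw bounding boxes, class IDs, and confidences onto the frame and save it.
    yolomodel::DrawDetections(cvFrame, vDetections);
    cv::imwrite("annotated_image.jpg", cvFrame);

    return 0;
}

The tensorflow::TPUInterpreter exposes the same Inference and Detection pattern for .tflite models, but device setup is handled by the TensorflowTPU base class in TensorflowTPU.hpp, so it is not sketched here.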