Autonomy Software C++ 24.5.1
Welcome to the Autonomy Software repository of the Mars Rover Design Team (MRDT) at Missouri University of Science and Technology (Missouri S&T)! This API reference contains the source code and other resources for the development of the autonomy software for our Mars rover. The Autonomy Software project aims to compete in the University Rover Challenge (URC) by demonstrating advanced autonomous capabilities and robust navigation algorithms.
YOLOModel.hpp
1
13#ifndef YOLO_MODEL_HPP
14#define YOLO_MODEL_HPP
15
16#include "../../AutonomyConstants.h"
17#include "../../interfaces/TensorflowTPU.hpp"
18
20#include <nlohmann/json.hpp>
21#include <opencv2/opencv.hpp>
22#include <torch/script.h>
23#include <torch/torch.h>
24
26
27
35namespace yolomodel
36{
37
44 struct Detection
45 {
46 public:
48 // Define public struct attributes.
50
51 int nClassID; // The class index of the object. Dependent on class order when trained.
52 std::string szClassName; // The class name of the object. This is dependent on the class names used when training.
53 float fConfidence; // The detection confidence of the object.
54 cv::Rect cvBoundingBox; // An object used to access the dimensions and other properties of the object's bounding box.
55 };
56
57
71 inline void NonMaxSuppression(std::vector<Detection>& vObjects,
72 std::vector<int>& vClassIDs,
73 std::vector<float>& vClassConfidences,
74 std::vector<cv::Rect>& vBoundingBoxes,
75 float fMinObjectConfidence,
76 float fNMSThreshold)
77 {
78 // Create instance variables.
79 std::vector<int> vNMSValidIndices;
80
81 // Perform Non-Max Suppression using OpenCV's implementation.
82 cv::dnn::NMSBoxes(vBoundingBoxes, vClassConfidences, fMinObjectConfidence, fNMSThreshold, vNMSValidIndices);
83
84 // Loop through each valid index.
85 for (int nValidIndex : vNMSValidIndices)
86 {
87 // Create new Detection struct.
88 Detection stNewDetection;
89 // Repackage prediction data into easy-to-use struct.
90 stNewDetection.nClassID = vClassIDs[nValidIndex];
91 stNewDetection.fConfidence = vClassConfidences[nValidIndex];
92 stNewDetection.cvBoundingBox = vBoundingBoxes[nValidIndex];
93
94 // Append new object detection to objects vector.
95 vObjects.emplace_back(stNewDetection);
96 }
97 }
98
99
109 inline void DrawDetections(cv::Mat& cvInputFrame, std::vector<Detection>& vObjects)
110 {
111 // Loop through each detection.
112 for (const Detection& stObject : vObjects)
113 {
114 // Calculate the hue value based on the class ID. OpenCV 8-bit hue values range from 0-179.
115 int nHue = static_cast<int>(stObject.nClassID % 180);
116 // Set saturation and value to 255 for full intensity colors.
117 int nSaturation = 255;
118 int nValue = 255;
119
120 // Convert HSV to BGR.
121 cv::Mat cvHSV(1, 1, CV_8UC3, cv::Scalar(nHue, nSaturation, nValue));
122 cv::cvtColor(cvHSV, cvHSV, cv::COLOR_HSV2BGR);
123 // Extract the RGB values
124 cv::Vec3b cvConvertedValues = cvHSV.at<cv::Vec3b>(0, 0);
125 cv::Scalar cvBoxColor(cvConvertedValues[2], cvConvertedValues[1], cvConvertedValues[0]);
126
127 // Draw bounding box onto image.
128 cv::rectangle(cvInputFrame, stObject.cvBoundingBox, cvBoxColor, 2);
129 // Draw classID background box onto image.
130 cv::rectangle(cvInputFrame,
131 cv::Point(stObject.cvBoundingBox.x, stObject.cvBoundingBox.y - 20),
132 cv::Point(stObject.cvBoundingBox.x + stObject.cvBoundingBox.width, stObject.cvBoundingBox.y),
133 cvBoxColor,
134 cv::FILLED);
135 // Draw class text onto image.
136 cv::putText(cvInputFrame,
137 std::to_string(stObject.nClassID) + " " + std::to_string(stObject.fConfidence),
138 cv::Point(stObject.cvBoundingBox.x, stObject.cvBoundingBox.y - 5),
139 cv::FONT_HERSHEY_SIMPLEX,
140 0.5,
141 cv::Scalar(255, 255, 255));
142 }
143 }
144
145
155 namespace tensorflow
156 {
157
165 struct InputTensorDimensions
166 {
167 public:
169 // Define public struct attributes.
171
172 int nHeight; // The height of the input image.
173 int nWidth; // The width of the input image.
174 int nChannels; // The number of channels of the input image.
175 int nTensorIndex; // The index of the tensor used to retrieve it from the interpreter.
176 int nQuantZeroPoint; // The value of the quantized input tensor that represents zero.
177 float fQuantScale; // The multiplier of each value to scale to meaningful numbers. (quantization)
178 };
179
180
188 struct OutputTensorDimensions
189 {
190 public:
192 // Define public struct attributes.
194
195 int nAnchors; // Determined from the trained image size of the model.
196 int nObjectnessLocationClasses; // The number of data points of each anchor. Each anchor contains a vector 5+nc (YOLOv5) or 4+nc (YOLOv8) long, where
197 // nc is the number of classes the model has.
198 int nTensorIndex; // The index of the tensor used to retrieve it from the interpreter.
199 int nQuantZeroPoint; // The value of the quantized output tensor that represents zero.
200 float fQuantScale; // The multiplier of each value to scale to meaningful numbers. (Undo quantization)
201 };
202
203
213 class TPUInterpreter : public TensorflowTPU<std::vector<std::vector<Detection>>, cv::Mat>
214 {
215 public:
217 // Declare public enums that are specific to and used within this class.
219
221 // Declare public methods and member variables.
223
224
240 TPUInterpreter(std::string szModelPath,
241 PerformanceModes ePowerMode = PerformanceModes::eHigh,
242 unsigned int unMaxBulkInQueueLength = 32,
243 bool bUSBAlwaysDFU = false) :
244 TensorflowTPU<std::vector<std::vector<Detection>>, cv::Mat>(szModelPath, ePowerMode, unMaxBulkInQueueLength, bUSBAlwaysDFU)
245
246 {}
247
248
255 ~TPUInterpreter()
256 {
257 // Nothing to destroy.
258 }
259
260
277 std::vector<std::vector<Detection>> Inference(const cv::Mat& cvInputFrame,
278 const float fMinObjectConfidence = 0.85,
279 const float fNMSThreshold = 0.6) override
280 {
281 // Create instance variables.
282 std::vector<std::vector<Detection>> vTensorObjectOutputs;
283
284 // Get the input tensor shape for the model.
285 InputTensorDimensions stInputDimensions = this->GetInputShape(m_pInterpreter->inputs()[0]);
286
287 // Copy given frame to class member variable.
288 m_cvFrame = cvInputFrame;
289
290 // Check if model is open and device is ready.
291 if (m_bDeviceOpened && m_pEdgeTPUContext->IsReady())
292 {
293 // Check if the image has the correct type.
294 if (m_cvFrame.type() != CV_8UC3)
295 {
296 // Convert image to unsigned int8 image.
297 m_cvFrame.convertTo(m_cvFrame, CV_8UC3);
298 }
299
300 // Check if the input image matches the input tensor shape.
301 if (m_cvFrame.rows != stInputDimensions.nHeight || m_cvFrame.cols != stInputDimensions.nWidth)
302 {
303 // Resize the image, and store a local copy of it.
304 cv::resize(m_cvFrame,
305 m_cvFrame,
306 cv::Size(stInputDimensions.nWidth, stInputDimensions.nHeight),
307 0.0, 0.0, constants::BASICCAM_RESIZE_INTERPOLATION_METHOD);
308 }
309
310 // Create a vector to store reshaped input image in 1 dimension.
311 std::vector<int8_t> vInputData(m_cvFrame.data,
312 m_cvFrame.data + (static_cast<unsigned long>(m_cvFrame.cols) * m_cvFrame.rows * m_cvFrame.elemSize()));
313 // Quantize input data.
314 // for (long unsigned int nIter = 0; nIter < vInputData.size(); ++nIter)
315 // {
316 // // Quantize value.
317 // vInputData[nIter] = std::round((vInputData[nIter] - 128) / stInputDimensions.fQuantScale) + stInputDimensions.nQuantZeroPoint;
318 // // vInputData[nIter] = vInputData[nIter] - 128;
319 // }
320 // Retrieve a new input tensor from the TPU interpreter and copy data to it. This tensor is automatically quantized because it is typed.
321 TfLiteTensor* pInputTensor = m_pInterpreter->tensor(stInputDimensions.nTensorIndex);
322 std::memcpy(pInputTensor->data.raw, vInputData.data(), vInputData.size());
323
324 // Run inference on the EdgeTPU.
325 if (m_pInterpreter->Invoke() != kTfLiteOk)
326 {
327 // Submit logger message.
328 LOG_WARNING(logging::g_qSharedLogger,
329 "Inferencing failed on an image for model {} with device {} ({})",
330 m_szModelPath,
331 m_tpuDevice.path,
332 this->DeviceTypeToString(m_tpuDevice.type));
333 }
334 else
335 {
336 // Create separate vectors for storing class confidences, bounding boxes, and classIDs.
337 std::vector<int> vClassIDs;
338 std::vector<float> vClassConfidences;
339 std::vector<cv::Rect> vBoundingBoxes;
340 // Create vector for storing all detections for this tensor output.
341 std::vector<Detection> vObjects;
342
343 // Get output indices for output tensors.
344 for (int nTensorIndex : m_pInterpreter->outputs())
345 {
346 // Clear prediction data vectors.
347 vClassIDs.clear();
348 vClassConfidences.clear();
349 vBoundingBoxes.clear();
350 // Clear object detections vector.
351 vObjects.clear();
352
353 /*
354 Check if the output tensor has a YOLOv5 format.
355 */
356 // Get the tensor output shape details.
357 OutputTensorDimensions stOutputDimensions = this->GetOutputShape(nTensorIndex);
358 // Calculate the number of grid cells at each YOLO output stride (P3=8, P4=16, P5=32) based on the input tensor shape.
359 int nImgSize = stInputDimensions.nHeight;
360 int nP3Stride = std::pow((nImgSize / 8), 2);
361 int nP4Stride = std::pow((nImgSize / 16), 2);
362 int nP5Stride = std::pow((nImgSize / 32), 2);
363 // Calculate the proper prediction length for different YOLO versions.
364 int nYOLOv5AnchorsPerGridPoint = 3;
365 int nYOLOv8AnchorsPerGridPoint = 1;
366 int nYOLOv5TotalPredictionLength =
367 (nP3Stride * nYOLOv5AnchorsPerGridPoint) + (nP4Stride * nYOLOv5AnchorsPerGridPoint) + (nP5Stride * nYOLOv5AnchorsPerGridPoint);
368 int nYOLOv8TotalPredictionLength =
369 (nP3Stride * nYOLOv8AnchorsPerGridPoint) + (nP4Stride * nYOLOv8AnchorsPerGridPoint) + (nP5Stride * nYOLOv8AnchorsPerGridPoint);
370
371 // Output tensor is YOLOv5 format.
372 if (stOutputDimensions.nAnchors == nYOLOv5TotalPredictionLength)
373 {
374 // Parse inferenced output from tensor.
375 this->ParseTensorOutputYOLOv5(nTensorIndex,
376 vClassIDs,
377 vClassConfidences,
378 vBoundingBoxes,
379 fMinObjectConfidence,
380 cvInputFrame.cols,
381 cvInputFrame.rows);
382 }
383 // Output tensor is YOLOv8 format.
384 else if (stOutputDimensions.nAnchors == nYOLOv8TotalPredictionLength)
385 {
386 // Parse inferenced output from tensor.
387 this->ParseTensorOutputYOLOv8(nTensorIndex,
388 vClassIDs,
389 vClassConfidences,
390 vBoundingBoxes,
391 fMinObjectConfidence,
392 cvInputFrame.cols,
393 cvInputFrame.rows);
394 }
395
396 // Perform NMS to filter out bad/duplicate detections.
397 NonMaxSuppression(vObjects, vClassIDs, vClassConfidences, vBoundingBoxes, fMinObjectConfidence, fNMSThreshold);
398
399 // Append object detections to the tensor outputs vector.
400 vTensorObjectOutputs.emplace_back(vObjects);
401 }
402 }
403 }
404 else
405 {
406 // Submit logger message.
407 LOG_WARNING(logging::g_qSharedLogger,
408 "Inferencing failed on an image for model {} with device {} ({})",
409 m_szModelPath,
410 m_tpuDevice.path,
411 this->DeviceTypeToString(m_tpuDevice.type));
412 }
413
414 return vTensorObjectOutputs;
415 }
416
417 private:
419 // Declare private methods.
421
422
448 void ParseTensorOutputYOLOv5(int nOutputIndex,
449 std::vector<int>& vClassIDs,
450 std::vector<float>& vClassConfidences,
451 std::vector<cv::Rect>& vBoundingBoxes,
452 float fMinObjectConfidence,
453 int nOriginalFrameWidth,
454 int nOriginalFrameHeight)
455 {
456 // Retrieve output tensor from interpreter.
457 TfLiteTensor* tfOutputTensor = m_pInterpreter->tensor(nOutputIndex);
458 // Get output tensor shape.
459 OutputTensorDimensions stOutputDimensions = this->GetOutputShape(nOutputIndex);
460 // Create vector for storing temporary values for this prediction.
461 std::vector<float> vGridPrediction;
462 // Resize the Grid prediction vector to match the number of classes + bounding_box + objectness score.
463 vGridPrediction.resize(stOutputDimensions.nObjectnessLocationClasses);
464
465 /*
466 Loop through each grid cell output of the model output and filter out objects that don't meet conf thresh.
467 Then, repackage into nice detection structs.
468 For YOLOv5, you divide your image size, i.e. 640 by the P3, P4, P5 output strides of 8, 16, 32 to arrive at grid sizes
469 of 80x80, 40x40, 20x20. Each grid point has 3 anchors by default (anchor box values: small, medium, large), and each anchor contains a vector 5 +
470 nc long, where nc is the number of classes the model has. So for a 640 image, the output tensor will be [1, 25200, 85]
471 */
472 for (int nIter = 0; nIter < stOutputDimensions.nAnchors; ++nIter)
473 {
474 // Get objectness confidence. This is the 5th value for each grid/anchor prediction. (4th index)
475 float fObjectnessConfidence =
476 (tfOutputTensor->data.uint8[(nIter * stOutputDimensions.nObjectnessLocationClasses) + 4] - stOutputDimensions.nQuantZeroPoint) *
477 stOutputDimensions.fQuantScale;
478
479 // Check if the object confidence is greater than or equal to the threshold.
480 if (fObjectnessConfidence >= fMinObjectConfidence)
481 {
482 // Loop through the number of object info and class confidences in the 2nd dimension.
483 // Predictions have format {center_x, center_y, width, height, object_conf, class0_conf, class1_conf, ...}
484 for (int nJter = 0; nJter < stOutputDimensions.nObjectnessLocationClasses; ++nJter)
485 {
486 // Repackage value into more usable vector. Also undo the quantization of the data.
487 vGridPrediction[nJter] =
488 (tfOutputTensor->data.uint8[(nIter * stOutputDimensions.nObjectnessLocationClasses) + nJter] - stOutputDimensions.nQuantZeroPoint) *
489 stOutputDimensions.fQuantScale;
490 }
491
492 // Find class ID based on which class confidence has the highest score.
493 std::vector<float>::iterator pStartIterator = vGridPrediction.begin() + 5;
494 std::vector<float>::iterator pMaxConfidence = std::max_element(pStartIterator, vGridPrediction.end());
495 int nClassID = std::distance(pStartIterator, pMaxConfidence);
496 // Get prediction confidence for class ID.
497 float fClassConfidence = vGridPrediction[nClassID + 5];
498 // Scale bounding box to match original input image size.
499 cv::Rect cvBoundingBox;
500 int nCenterX = vGridPrediction[0] * nOriginalFrameWidth;
501 int nCenterY = vGridPrediction[1] * nOriginalFrameHeight;
502 int nWidth = vGridPrediction[2] * nOriginalFrameWidth;
503 int nHeight = vGridPrediction[3] * nOriginalFrameHeight;
504 // Check if the width and height of the object are greater than zero.
505 if (nWidth > 0 && nHeight > 0)
506 {
507 // Repackaged bounding box data to be more readable.
508 cvBoundingBox.x = int(nCenterX - (0.5 * nWidth)); // Rect.x is the top-left corner not center point.
509 cvBoundingBox.y = int(nCenterY - (0.5 * nHeight)); // Rect.y is the top-left corner not center point.
510 cvBoundingBox.width = nWidth;
511 cvBoundingBox.height = nHeight;
512 // Add data to vectors.
513 vClassIDs.emplace_back(nClassID);
514 vClassConfidences.emplace_back(fClassConfidence);
515 vBoundingBoxes.emplace_back(cvBoundingBox);
516 }
517 }
518 }
519 }
520
521
544 void ParseTensorOutputYOLOv8(int nOutputIndex,
545 std::vector<int>& vClassIDs,
546 std::vector<float>& vClassConfidences,
547 std::vector<cv::Rect>& vBoundingBoxes,
548 float fMinObjectConfidence,
549 int nOriginalFrameWidth,
550 int nOriginalFrameHeight)
551 {
552 // Retrieve output tensor from interpreter.
553 TfLiteTensor* tfOutputTensor = m_pInterpreter->tensor(nOutputIndex);
554 // Get output tensor shape.
555 OutputTensorDimensions stOutputDimensions = this->GetOutputShape(nOutputIndex);
556 // Create vector for storing temporary values for this prediction.
557 std::vector<float> vGridPrediction;
558 // Resize the Grid prediction vector to match the number of classes + bounding_box + objectness score.
559 vGridPrediction.resize(stOutputDimensions.nObjectnessLocationClasses);
560
561 /*
562 Loop through each grid cell output of the model output and filter out objects that don't meet conf thresh.
563 Then, repackage into nice detection structs.
564 For YOLOv8, you divide your image size, i.e. 640 by the P3, P4, P5 output strides of 8, 16, 32 to arrive at grid sizes
565 of 80x80, 40x40, 20x20. Each grid point has 1 anchor, and each anchor contains a vector 4 + nc long, where nc is the number
566 of classes the model has. So for a 640 image, the output tensor will be [1, 84, 8400] (80 classes). Notice how the larger dimension is swapped
567 when compared to YOLOv5.
568 */
569 for (int nIter = 0; nIter < stOutputDimensions.nAnchors; ++nIter)
570 {
571 // Loop through the number of object info and class confidences in the 2nd dimension.
572 // Predictions have format {center_x, center_y, width, height, class0_conf, class1_conf, ...}
574 for (int nJter = 0; nJter < stOutputDimensions.nObjectnessLocationClasses; ++nJter)
575 {
576 // Repackage values into more usable vector. Also undo the quantization of the data.
577 vGridPrediction[nJter] = (tfOutputTensor->data.int8[nIter + (nJter * stOutputDimensions.nAnchors)] - stOutputDimensions.nQuantZeroPoint) *
578 stOutputDimensions.fQuantScale;
579 }
580
581 // Find class ID based on which class confidence has the highest score.
582 std::vector<float>::iterator pStartIterator = vGridPrediction.begin() + 4;
583 std::vector<float>::iterator pMaxConfidence = std::max_element(pStartIterator, vGridPrediction.end());
584 int nClassID = std::distance(pStartIterator, pMaxConfidence);
585 // Get prediction confidence for class ID.
586 float fClassConfidence = vGridPrediction[nClassID + 4];
587
588 // Check if class confidence meets threshold.
589 if (fClassConfidence >= fMinObjectConfidence)
590 {
591 // Scale bounding box to match original input image size.
592 cv::Rect cvBoundingBox;
593 int nCenterX = vGridPrediction[0] * nOriginalFrameWidth;
594 int nCenterY = vGridPrediction[1] * nOriginalFrameHeight;
595 int nWidth = vGridPrediction[2] * nOriginalFrameWidth;
596 int nHeight = vGridPrediction[3] * nOriginalFrameHeight;
597 // Repackaged bounding box data to be more readable.
598 cvBoundingBox.x = int(nCenterX - (0.5 * nWidth)); // Rect.x is the top-left corner not center point.
599 cvBoundingBox.y = int(nCenterY - (0.5 * nHeight)); // Rect.y is the top-left corner not center point.
600 cvBoundingBox.width = nWidth;
601 cvBoundingBox.height = nHeight;
602 // Add data to vectors.
603 vClassIDs.emplace_back(nClassID);
604 vClassConfidences.emplace_back(fClassConfidence);
605 vBoundingBoxes.emplace_back(cvBoundingBox);
606 }
607 }
608 }
609
610
622 InputTensorDimensions GetInputShape(const int nTensorIndex = 0)
623 {
624 // Create instance variables.
625 InputTensorDimensions stInputDimensions = {0, 0, 0, 0, 0, 0};
626
627 // Check if interpreter has been built.
628 if (m_bDeviceOpened)
629 {
630 // Get the desired input tensor shape of the model.
631 TfLiteTensor* tfInputTensor = m_pInterpreter->tensor(nTensorIndex);
632 TfLiteIntArray* tfDimensions = tfInputTensor->dims;
633
634 // Package dimensions into struct.
635 stInputDimensions.nHeight = tfDimensions->data[1];
636 stInputDimensions.nWidth = tfDimensions->data[2];
637 stInputDimensions.nChannels = tfDimensions->data[3];
638 stInputDimensions.nTensorIndex = nTensorIndex;
639 // Get the quantization zero point and scale for the input tensor.
640 stInputDimensions.nQuantZeroPoint = tfInputTensor->params.zero_point;
641 stInputDimensions.fQuantScale = tfInputTensor->params.scale;
642 }
643
644 return stInputDimensions;
645 }
646
647
659 OutputTensorDimensions GetOutputShape(const int nTensorIndex = 0)
660 {
661 // Create instance variables.
662 OutputTensorDimensions stOutputDimensions = {0, 0, 0, 0, 0};
663
664 // Check if interpreter has been built.
665 if (m_bDeviceOpened)
666 {
667 // Get the desired output tensor shape of the model.
668 TfLiteTensor* tfOutputTensor = m_pInterpreter->tensor(nTensorIndex);
669 TfLiteIntArray* tfDimensions = tfOutputTensor->dims;
670
671 // Package dimensions into struct. Assume anchors will always be the longer dimension.
672 stOutputDimensions.nAnchors = std::max(tfDimensions->data[1], tfDimensions->data[2]);
673 stOutputDimensions.nObjectnessLocationClasses = std::min(tfDimensions->data[1], tfDimensions->data[2]);
674 stOutputDimensions.nTensorIndex = nTensorIndex;
675 // Get the quantization zero point and scale for output tensor.
676 stOutputDimensions.nQuantZeroPoint = tfOutputTensor->params.zero_point;
677 stOutputDimensions.fQuantScale = tfOutputTensor->params.scale;
678 }
679
680 return stOutputDimensions;
681 }
682
684 // Declare private member variables.
686 cv::Mat m_cvFrame;
687 };
688 } // namespace tensorflow
689
690
700 namespace pytorch
701 {
702
709 class PyTorchInterpreter
710 {
711 public:
713 // Declare public enums that are specific to and used within this class.
715 enum class HardwareDevices
716 {
717 eCPU, // The CPU device.
718 eCUDA // The CUDA device.
719 };
720
722 // Declare public methods and member variables.
724
725
734 PyTorchInterpreter(std::string szModelPath, HardwareDevices eHardwareDevice = HardwareDevices::eCUDA)
735 {
736 // Initialize member variables.
737 m_szModelPath = szModelPath;
738 m_bReady = false;
739 m_cvModelInputSize = cv::Size(640, 640);
740 m_szModelTask = "Unknown";
741 m_vClassLabels = std::vector<std::string>();
742
743 // Translate the hardware device enum to a torch device.
744 switch (eHardwareDevice)
745 {
746 case HardwareDevices::eCPU: m_trDevice = torch::kCPU; break;
747 case HardwareDevices::eCUDA: m_trDevice = torch::kCUDA; break;
748 default: m_trDevice = torch::kCPU; break;
749 }
750
751 // Submit logger message.
752 LOG_INFO(logging::g_qSharedLogger, "Attempting to load model {} onto device {}", szModelPath, m_trDevice.str());
753
754 // Check if the model path is valid.
755 if (!std::filesystem::exists(szModelPath))
756 {
757 // Submit logger message.
758 LOG_ERROR(logging::g_qSharedLogger, "Model path {} does not exist!", szModelPath);
759 return;
760 }
761 // Check if the device is available.
762 if (!torch::cuda::is_available() && m_trDevice == torch::kCUDA)
763 {
764 // Submit logger message.
765 LOG_ERROR(logging::g_qSharedLogger, "CUDA device is not available, falling back to CPU.");
766 m_trDevice = torch::kCPU;
768 }
769 else
770 {
771 // Submit logger message.
772 LOG_INFO(logging::g_qSharedLogger, "Using device: {}", m_trDevice.str());
773 }
774
775 // Finally, attempt to load the model.
776 try
777 {
778 // Load the model and set it to eval mode for inference.
779 torch::jit::ExtraFilesMap trExtraConfigFiles{{"config.txt", ""}};
780 m_trModel = torch::jit::load(szModelPath, m_trDevice, trExtraConfigFiles);
781 m_trModel.eval();
782
783 // Use nlohmann json to parse the config file.
784 nlohmann::json jConfig = nlohmann::json::parse(trExtraConfigFiles.at("config.txt"));
785 // Get the input image size for the model.
786 m_cvModelInputSize = cv::Size(jConfig["imgsz"][0], jConfig["imgsz"][1]);
787 m_szModelTask = jConfig["task"];
788 for (const auto& item : jConfig["names"].items())
789 {
790 m_vClassLabels.push_back(item.value());
791 }
792 // Submit the config json as a debug message.
793 LOG_DEBUG(logging::g_qSharedLogger, "Model config: {}", jConfig.dump(4));
794
795 // Check if the model is empty.
796 if (m_trModel.get_methods().empty())
797 {
798 LOG_ERROR(logging::g_qSharedLogger, "Model is empty! Check if the correct model file was provided.");
799 return;
800 }
801 // Check if the model did not move to the expected device.
802 if (m_trModel.buffers().size() > 0)
803 {
804 // Get the device of the model.
805 torch::Device model_device = m_trModel.buffers().begin().operator->().device();
806 if (model_device != m_trDevice)
807 {
808 LOG_ERROR(logging::g_qSharedLogger, "Model did not move to the expected device! Model is on: {}", model_device.str());
809 return;
810 }
811 }
812 else
813 {
814 LOG_WARNING(logging::g_qSharedLogger, "Model has no buffers to check the device.");
815 }
816
817 // Model is ready for inference.
818 LOG_INFO(logging::g_qSharedLogger,
819 "Model successfully loaded and set to eval mode. The model is a {} model, and has {} classes.",
820 m_szModelTask,
821 m_vClassLabels.size());
822
823 // Set flag saying we are ready for inference.
824 m_bReady = true;
825 }
826 catch (const c10::Error& trError)
827 {
828 LOG_ERROR(logging::g_qSharedLogger, "Error loading model: {}", trError.what());
829 }
830 }
831
832
839 ~PyTorchInterpreter()
840 {
841 // Nothing to destroy.
842 }
843
844
859 std::vector<Detection> Inference(const cv::Mat& cvInputFrame, const float fMinObjectConfidence = 0.85, const float fNMSThreshold = 0.6)
860 {
861 // Force single-threaded execution (if acceptable for your workload)
862 torch::set_num_threads(1);
863 // Create instance variables.
864 std::vector<Detection> vObjects;
865
866 // Preprocess the given image and pack it into a tensor.
867 torch::Tensor trTensorImage = PreprocessImage(cvInputFrame, m_trDevice);
868
869 // Perform inference.
870 std::vector<torch::jit::IValue> vInputs;
871 vInputs.push_back(trTensorImage);
872 torch::Tensor trOutputTensor;
873 try
874 {
875 trOutputTensor = m_trModel.forward(vInputs).toTensor();
876 }
877 catch (const c10::Error& trError)
878 {
879 LOG_ERROR(logging::g_qSharedLogger, "Error running inference: {}", trError.what());
880 return vObjects;
881 }
882
883 // Calculate the number of grid cells at each YOLO output stride (P3=8, P4=16, P5=32) based on the input tensor shape.
884 int nImgSize = m_cvModelInputSize.height;
885 int nP3Stride = std::pow((nImgSize / 8), 2);
886 int nP4Stride = std::pow((nImgSize / 16), 2);
887 int nP5Stride = std::pow((nImgSize / 32), 2);
888 // Calculate the proper prediction length for different YOLO versions.
889 int nYOLOv5AnchorsPerGridPoint = 3;
890 int nYOLOv8AnchorsPerGridPoint = 1;
891 int nYOLOv5TotalPredictionLength =
892 (nP3Stride * nYOLOv5AnchorsPerGridPoint) + (nP4Stride * nYOLOv5AnchorsPerGridPoint) + (nP5Stride * nYOLOv5AnchorsPerGridPoint);
893 int nYOLOv8TotalPredictionLength =
894 (nP3Stride * nYOLOv8AnchorsPerGridPoint) + (nP4Stride * nYOLOv8AnchorsPerGridPoint) + (nP5Stride * nYOLOv8AnchorsPerGridPoint);
895
896 // Parse the output tensor.
897 std::vector<int> vClassIDs;
898 std::vector<std::string> vClassLabels;
899 std::vector<float> vClassConfidences;
900 std::vector<cv::Rect> vBoundingBoxes;
901
902 // Get the largest dimension of our output tensor.
903 int nLargestDimension = *std::max_element(trOutputTensor.sizes().begin(), trOutputTensor.sizes().end());
904 // Check if the output tensor is YOLOv5 format.
905 if (nLargestDimension == nYOLOv5TotalPredictionLength)
906 {
907 // Parse inferenced output from tensor.
908 this->ParseTensorOutputYOLOv5(trOutputTensor, vClassIDs, vClassConfidences, vBoundingBoxes, cvInputFrame.size(), fMinObjectConfidence);
909 }
910 // Check if the output tensor is YOLOv8 format.
911 else if (nLargestDimension == nYOLOv8TotalPredictionLength)
912 {
913 // Parse inferenced output from tensor.
914 this->ParseTensorOutputYOLOv8(trOutputTensor, vClassIDs, vClassConfidences, vBoundingBoxes, cvInputFrame.size(), fMinObjectConfidence);
915 }
916
917 // Perform NMS to filter out bad/duplicate detections.
918 NonMaxSuppression(vObjects, vClassIDs, vClassConfidences, vBoundingBoxes, fMinObjectConfidence, fNMSThreshold);
919
920 // Loop through the final detections and set the class names for each detection based on the class ID.
921 for (size_t nIter = 0; nIter < vObjects.size(); ++nIter)
922 {
923 // Check if the class ID is valid.
924 if (vClassIDs[nIter] >= 0 && vClassIDs[nIter] < static_cast<int>(m_vClassLabels.size()))
925 {
926 vObjects[nIter].szClassName = m_vClassLabels[vClassIDs[nIter]];
927 }
928 else
929 {
930 vObjects[nIter].szClassName = "UnknownClass";
931 }
932 }
933
934 return vObjects;
935 }
936
937
946 bool IsReadyForInference() const { return m_bReady; }
947
948 private:
950 // Declare private methods.
952
953
964 torch::Tensor PreprocessImage(const cv::Mat& cvInputFrame, const torch::Device& trDevice)
965 {
966 // Resize the input image to match model and normalize it to 0-1.
967 cv::Mat cvResizedImage;
968 cv::resize(cvInputFrame, cvResizedImage, cv::Size(m_cvModelInputSize.width, m_cvModelInputSize.height), 0.0, 0.0, cv::INTER_LINEAR);
969 cvResizedImage.convertTo(cvResizedImage, CV_32FC3, 1.0 / 255.0);
970
971 // Convert OpenCV mat to a tensor.
972 torch::Tensor trTensorImage = torch::from_blob(cvResizedImage.data, {1, cvResizedImage.rows, cvResizedImage.cols, 3}, torch::kFloat);
973 trTensorImage = trTensorImage.permute({0, 3, 1, 2}); // Convert to CxHxW format.
974 trTensorImage = trTensorImage.to(trDevice); // Move tensor to the specified hardware device.
975
976 return trTensorImage;
977 }
978
979
993 void ParseTensorOutputYOLOv5(const torch::Tensor& trOutput,
994 std::vector<int>& vClassIDs,
995 std::vector<float>& vClassConfidences,
996 std::vector<cv::Rect>& vBoundingBoxes,
997 const cv::Size& cvInputFrameSize,
998 const float fMinObjectConfidence)
999 {
1000 /*
1001 * For YOLOv5, you divide your image size, i.e. 640 by the P3, P4, P5 output strides of 8, 16, 32 to arrive at grid sizes
1002 * of 80x80, 40x40, 20x20. Each grid point has 3 anchors by default (anchor box values: small, medium, large), and each anchor contains a vector 5 +
1003 * nc long, where nc is the number of classes the model has. So for a 640 image, the output tensor will be [1, 25200, 85]
1004 */
1005 // Squeeze the batch dimension from the output tensor.
1006 torch::Tensor trSqueezedOutput = trOutput.squeeze(0);
1007
1008 // Move the tensor to CPU if necessary. If we're using GPU and we don't move the tensor to CPU, we will get an error and it will be slow.
1009 if (trSqueezedOutput.device().is_cuda())
1010 {
1011 trSqueezedOutput = trSqueezedOutput.to(torch::kCPU);
1012 }
1013 // Convert tensor to float if necessary.
1014 if (trSqueezedOutput.scalar_type() != torch::kFloat32)
1015 {
1016 trSqueezedOutput = trSqueezedOutput.to(torch::kFloat32);
1017 }
1018 // Ensure tensor is contiguous in memory.
1019 if (!trSqueezedOutput.is_contiguous())
1020 {
1021 trSqueezedOutput = trSqueezedOutput.contiguous();
1022 }
1023
1024 // Create an accessor for fast element-wise access.
1025 at::TensorAccessor trAccessor = trSqueezedOutput.accessor<float, 2>();
1026 const int nNumDetections = trSqueezedOutput.size(0);
1027 const int nTotalValues = trSqueezedOutput.size(1); // equals 5 + number_of_classes
1028
1029 // Loop through each detection.
1030 for (int i = 0; i < nNumDetections; i++)
1031 {
1032 // Get the objectness confidence. This is the 5th value for each grid/anchor prediction. (4th index)
1033 float fObjectnessConfidence = trAccessor[i][4];
1034
1035 // Check if the object confidence is greater than or equal to the threshold.
1036 if (fObjectnessConfidence < fMinObjectConfidence)
1037 {
1038 continue;
1039 }
1040
1041 // Retrieve bounding box data.
1042 float fCenterX = trAccessor[i][0];
1043 float fCenterY = trAccessor[i][1];
1044 float fWidth = trAccessor[i][2];
1045 float fHeight = trAccessor[i][3];
1046
1047 // Scale bounding box to original image size.
1048 int nLeft = static_cast<int>((fCenterX - (0.5 * fWidth)) * cvInputFrameSize.width);
1049 int nTop = static_cast<int>((fCenterY - (0.5 * fHeight)) * cvInputFrameSize.height);
1050 int nBoundingWidth = static_cast<int>(fWidth * cvInputFrameSize.width);
1051 int nBoundingHeight = static_cast<int>(fHeight * cvInputFrameSize.height);
1052
1053 // Repackaged bounding box data to be more readable.
1054 cv::Rect cvBoundingBox(nLeft, nTop, nBoundingWidth, nBoundingHeight);
1055
1056 // Loop over class confidence values and find the class ID with the highest confidence.
1057 float fClassConfidence = -1.0f;
1058 int nClassID = -1;
1059 for (int j = 5; j < nTotalValues; j++)
1060 {
1061 float fConfidence = trAccessor[i][j];
1062 if (fConfidence > fClassConfidence)
1063 {
1064 fClassConfidence = fConfidence;
1065 nClassID = j - 5;
1066 }
1067 }
1068
1069 // Only process detections that meet the minimum confidence.
1070 if (fClassConfidence < fMinObjectConfidence)
1071 {
1072 continue;
1073 }
1074
1075 // Add data to vectors.
1076 vClassIDs.emplace_back(nClassID);
1077 vClassConfidences.emplace_back(fClassConfidence);
1078 vBoundingBoxes.emplace_back(cvBoundingBox);
1079 }
1080 }
1081
1082
1101 void ParseTensorOutputYOLOv8(const torch::Tensor& trOutput,
1102 std::vector<int>& vClassIDs,
1103 std::vector<float>& vClassConfidences,
1104 std::vector<cv::Rect>& vBoundingBoxes,
1105 const cv::Size& cvInputFrameSize,
1106 const float fMinObjectConfidence)
1107 {
1108 /*
1109 * Permute the output tensor shape to match the expected format of the model. If the model is YOLOv8, the output
1110 * shape for a 640x640 image will be [1, 4 + nc, 8400] (nc = number of classes). Notice how the larger dimension is swapped
1111 * when compared to YOLOv5. We will permute the tensor to [1, 8400, 4 + nc] to make it easier to parse. Then squeeze the
1112 * tensor to remove the batch dimension so the final shape will be [8400, 4 + nc]. Thanks pytorch for being cool with the
1113 * permute function.
1114 */
1115 // Permute the tensor shape from [1, 4 + nc, 8400] to [1, 8400, 4 + nc]
1116 // and then squeeze to remove the batch dimension, resulting in [8400, 4 + nc]
1117 torch::Tensor trPermuteOutput = trOutput.permute({0, 2, 1}).squeeze(0);
1118
1119 // Move tensor to CPU if necessary. If we're using GPU and we don't move the tensor to CPU, we will get an error and it will be slow.
1120 if (trPermuteOutput.device().is_cuda())
1121 {
1122 trPermuteOutput = trPermuteOutput.to(torch::kCPU);
1123 }
1124 // Convert tensor to float if necessary.
1125 if (trPermuteOutput.scalar_type() != torch::kFloat32)
1126 {
1127 trPermuteOutput = trPermuteOutput.to(torch::kFloat32);
1128 }
1129 // Ensure tensor is contiguous in memory.
1130 if (!trPermuteOutput.is_contiguous())
1131 {
1132 trPermuteOutput = trPermuteOutput.contiguous();
1133 }
1134
1135 // Create an accessor for fast element-wise access.
1136 at::TensorAccessor trAccessor = trPermuteOutput.accessor<float, 2>();
1137 const int nNumDetections = trPermuteOutput.size(0);
1138 const int nTotalValues = trPermuteOutput.size(1); // equals 4 + number_of_classes
1139
1140 // Loop through each detection.
1141 for (int i = 0; i < nNumDetections; i++)
1142 {
1143 float fClassConfidence = -1.0f;
1144 int nClassID = -1;
1145
1146 // Loop over class confidence values.
1147 for (int j = 4; j < nTotalValues; j++)
1148 {
1149 float fConfidence = trAccessor[i][j];
1150 if (fConfidence > fClassConfidence)
1151 {
1152 fClassConfidence = fConfidence;
1153 nClassID = j - 4;
1154 }
1155 }
1156
1157 // Only process detections that meet the minimum confidence.
1158 if (fClassConfidence < fMinObjectConfidence)
1159 {
1160 continue;
1161 }
1162
1163 // Retrieve bounding box data.
1164 float fCenterX = trAccessor[i][0];
1165 float fCenterY = trAccessor[i][1];
1166 float fWidth = trAccessor[i][2];
1167 float fHeight = trAccessor[i][3];
1168
1169 // Scale bounding box to original image size.
1170 int nLeft = static_cast<int>((fCenterX - (0.5f * fWidth)) * cvInputFrameSize.width / m_cvModelInputSize.width);
1171 int nTop = static_cast<int>((fCenterY - (0.5f * fHeight)) * cvInputFrameSize.height / m_cvModelInputSize.height);
1172 int nBoxWidth = static_cast<int>(fWidth * cvInputFrameSize.width / m_cvModelInputSize.width);
1173 int nBoxHeight = static_cast<int>(fHeight * cvInputFrameSize.height / m_cvModelInputSize.height);
1174 cv::Rect cvBoundingBox(nLeft, nTop, nBoxWidth, nBoxHeight);
1175
1176 // Append results.
1177 vClassIDs.push_back(nClassID);
1178 vClassConfidences.push_back(fClassConfidence);
1179 vBoundingBoxes.push_back(cvBoundingBox);
1180 }
1181 }
1182
1184 // Declare private member variables.
1186 torch::jit::script::Module m_trModel;
1187 torch::Device m_trDevice = torch::kCPU;
1188 std::string m_szModelPath;
1189 bool m_bReady;
1190 std::string m_szModelTask;
1191 cv::Size m_cvModelInputSize;
1192 std::vector<std::string> m_vClassLabels;
1193 };
1194 } // namespace pytorch
1195} // namespace yolomodel
1196
1197#endif
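
The listing above is the header as it appears in the repository. As a rough illustration of how it is meant to be consumed, the sketch below constructs a PyTorchInterpreter, runs inference on a single frame, and overlays the results with DrawDetections. It is a minimal, hypothetical example: the include path, model path, and image file names are placeholders and not files that ship with the repository.

#include "YOLOModel.hpp"    // Placeholder include; adjust the path to wherever the header lives in the repository.

#include <opencv2/opencv.hpp>

int main()
{
    // Load a TorchScript YOLO model. The path is a placeholder. CUDA is requested here;
    // the constructor logs and handles the case where it is unavailable.
    yolomodel::pytorch::PyTorchInterpreter trInterpreter("path/to/model.torchscript",
                                                         yolomodel::pytorch::PyTorchInterpreter::HardwareDevices::eCUDA);

    // Stop if the model failed to load.
    if (!trInterpreter.IsReadyForInference())
    {
        return 1;
    }

    // Read a test image. Any BGR cv::Mat, such as a camera frame, works the same way.
    cv::Mat cvFrame = cv::imread("test_image.jpg");
    if (cvFrame.empty())
    {
        return 1;
    }

    // Run inference with the default confidence (0.85) and NMS (0.6) thresholds.
    std::vector<yolomodel::Detection> vDetections = trInterpreter.Inference(cvFrame);

    // Draw bounding boxes, class IDs, and confidences onto the frame and save it.
    yolomodel::DrawDetections(cvFrame, vDetections);
    cv::imwrite("annotated_image.jpg", cvFrame);

    return 0;
}

The tensorflow::TPUInterpreter exposes the same Inference and Detection pattern for .tflite models, but device setup is handled by the TensorflowTPU base class in TensorflowTPU.hpp, so it is not sketched here.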