15#include "../../AutonomyConstants.h"
18#include <nlohmann/json.hpp>
19#include <opencv2/opencv.hpp>
20#include <torch/script.h>
21#include <torch/torch.h>
50 std::string szClassName;
70 std::vector<int>& vClassIDs,
71 std::vector<float>& vClassConfidences,
72 std::vector<cv::Rect>& vBoundingBoxes,
73 float fMinObjectConfidence,
77 std::vector<int> vNMSValidIndices;
80 cv::dnn::NMSBoxes(vBoundingBoxes, vClassConfidences, fMinObjectConfidence, fNMSThreshold, vNMSValidIndices);
83 for (
int nValidIndex : vNMSValidIndices)
88 stNewDetection.nClassID = vClassIDs[nValidIndex];
89 stNewDetection.fConfidence = vClassConfidences[nValidIndex];
90 stNewDetection.cvBoundingBox = vBoundingBoxes[nValidIndex];
93 vObjects.emplace_back(stNewDetection);
113 int nHue =
static_cast<int>(stObject.nClassID % 256);
115 int nSaturation = 255;
123 cv::Scalar cvBoxColor(cvConvertedValues[2], cvConvertedValues[1], cvConvertedValues[0]);
126 cv::rectangle(cvInputFrame, stObject.cvBoundingBox, cvBoxColor, 2);
129 cv::Point(stObject.cvBoundingBox.x, stObject.cvBoundingBox.y - 20),
130 cv::Point(stObject.cvBoundingBox.x + stObject.cvBoundingBox.width, stObject.cvBoundingBox.y),
135 std::to_string(stObject.nClassID) +
" " + std::to_string(stObject.fConfidence),
136 cv::Point(stObject.cvBoundingBox.x, stObject.cvBoundingBox.y - 5),
168 enum class HardwareDevices
187 PyTorchInterpreter(std::string szModelPath, HardwareDevices eHardwareDevice = HardwareDevices::eCUDA)
190 m_szModelPath = szModelPath;
192 m_cvModelInputSize =
cv::Size(640, 640);
193 m_szModelTask =
"Unknown";
194 m_vClassLabels = std::vector<std::string>();
197 switch (eHardwareDevice)
199 case HardwareDevices::eCPU: m_trDevice = torch::kCPU;
break;
200 case HardwareDevices::eCUDA: m_trDevice = torch::kCUDA;
break;
201 default: m_trDevice = torch::kCPU;
break;
205 LOG_INFO(logging::g_qSharedLogger,
"Attempting to load model {} onto device {}", szModelPath, m_trDevice.str());
208 if (!std::filesystem::exists(szModelPath))
211 LOG_ERROR(logging::g_qSharedLogger,
"Model path {} does not exist!", szModelPath);
215 if (!torch::cuda::is_available() && m_trDevice == torch::kCUDA)
218 LOG_ERROR(logging::g_qSharedLogger,
"CUDA device is not available, falling back to CPU.");
219 m_trDevice = torch::kCPU;
225 LOG_INFO(logging::g_qSharedLogger,
"Using device: {}", m_trDevice.str());
232 torch::jit::ExtraFilesMap trExtraConfigFiles{{
"config.txt",
""}};
233 m_trModel = torch::jit::load(szModelPath, m_trDevice, trExtraConfigFiles);
237 nlohmann::json jConfig = nlohmann::json::parse(trExtraConfigFiles.at(
"config.txt"));
239 m_cvModelInputSize =
cv::Size(jConfig[
"imgsz"][0], jConfig[
"imgsz"][1]);
240 m_szModelTask = jConfig[
"task"];
241 for (
const auto& item : jConfig[
"names"].items())
243 m_vClassLabels.push_back(item.value());
246 LOG_DEBUG(logging::g_qSharedLogger,
"Model config: {}", jConfig.dump(4));
249 if (m_trModel.get_methods().empty())
251 LOG_ERROR(logging::g_qSharedLogger,
"Model is empty! Check if the correct model file was provided.");
255 if (m_trModel.buffers().size() > 0)
258 torch::Device model_device = m_trModel.buffers().begin().operator->().device();
259 if (model_device != m_trDevice)
261 LOG_ERROR(logging::g_qSharedLogger,
"Model did not move to the expected device! Model is on: {}", model_device.str());
267 LOG_INFO(logging::g_qSharedLogger,
268 "Model successfully loaded and set to eval mode. The model is a {} model, and has {} classes.",
270 m_vClassLabels.size());
275 catch (
const c10::Error& trError)
277 LOG_ERROR(logging::g_qSharedLogger,
"Error loading model: {}", trError.what());
308 std::vector<Detection>
Inference(
const cv::Mat& cvInputFrame,
const float fMinObjectConfidence = 0.85,
const float fNMSThreshold = 0.6)
311 torch::set_num_threads(1);
313 std::vector<Detection> vObjects;
316 torch::Tensor trTensorImage =
PreprocessImage(cvInputFrame, m_trDevice);
319 std::vector<torch::jit::IValue> vInputs;
320 vInputs.push_back(trTensorImage);
321 torch::Tensor trOutputTensor;
324 trOutputTensor = m_trModel.forward(vInputs).toTensor();
326 catch (
const c10::Error& trError)
328 LOG_ERROR(logging::g_qSharedLogger,
"Error running inference: {}", trError.what());
333 int nImgSize = m_cvModelInputSize.
height;
334 int nP3Stride = std::pow((nImgSize / 8), 2);
335 int nP4Stride = std::pow((nImgSize / 16), 2);
336 int nP5Stride = std::pow((nImgSize / 32), 2);
338 int nYOLOv5AnchorsPerGridPoint = 3;
339 int nYOLOv8AnchorsPerGridPoint = 1;
340 int nYOLOv5TotalPredictionLength =
341 (nP3Stride * nYOLOv5AnchorsPerGridPoint) + (nP4Stride * nYOLOv5AnchorsPerGridPoint) + (nP5Stride * nYOLOv5AnchorsPerGridPoint);
342 int nYOLOv8TotalPredictionLength =
343 (nP3Stride * nYOLOv8AnchorsPerGridPoint) + (nP4Stride * nYOLOv8AnchorsPerGridPoint) + (nP5Stride * nYOLOv8AnchorsPerGridPoint);
346 std::vector<int> vClassIDs;
347 std::vector<std::string> vClassLabels;
348 std::vector<float> vClassConfidences;
349 std::vector<cv::Rect> vBoundingBoxes;
352 int nLargestDimension = *std::max_element(trOutputTensor.sizes().begin(), trOutputTensor.sizes().end());
354 if (nLargestDimension == nYOLOv5TotalPredictionLength)
357 this->
ParseTensorOutputYOLOv5(trOutputTensor, vClassIDs, vClassConfidences, vBoundingBoxes, cvInputFrame.
size(), fMinObjectConfidence);
360 else if (nLargestDimension == nYOLOv8TotalPredictionLength)
363 this->
ParseTensorOutputYOLOv8(trOutputTensor, vClassIDs, vClassConfidences, vBoundingBoxes, cvInputFrame.
size(), fMinObjectConfidence);
367 NonMaxSuppression(vObjects, vClassIDs, vClassConfidences, vBoundingBoxes, fMinObjectConfidence, fNMSThreshold);
370 for (
size_t nIter = 0; nIter < vObjects.size(); ++nIter)
373 if (vClassIDs[nIter] >= 0 && vClassIDs[nIter] <
static_cast<int>(m_vClassLabels.size()))
375 vObjects[nIter].szClassName = m_vClassLabels[vClassIDs[nIter]];
379 vObjects[nIter].szClassName =
"UnknownClass";
421 torch::Tensor trTensorImage = torch::from_blob(cvResizedImage.
data, {1, cvResizedImage.rows, cvResizedImage.cols, 3}, torch::kFloat);
422 trTensorImage = trTensorImage.permute({0, 3, 1, 2});
423 trTensorImage = trTensorImage.to(trDevice);
425 return trTensorImage;
443 std::vector<int>& vClassIDs,
444 std::vector<float>& vClassConfidences,
445 std::vector<cv::Rect>& vBoundingBoxes,
447 const float fMinObjectConfidence)
455 torch::Tensor trSqueezedOutput = trOutput.squeeze(0);
458 if (trSqueezedOutput.device().is_cuda())
460 trSqueezedOutput = trSqueezedOutput.to(torch::kCPU);
463 if (trSqueezedOutput.scalar_type() != torch::kFloat32)
465 trSqueezedOutput = trSqueezedOutput.to(torch::kFloat32);
468 if (!trSqueezedOutput.is_contiguous())
470 trSqueezedOutput = trSqueezedOutput.contiguous();
474 at::TensorAccessor trAccessor = trSqueezedOutput.accessor<float, 2>();
475 const int nNumDetections = trSqueezedOutput.size(0);
476 const int nTotalValues = trSqueezedOutput.size(1);
479 for (
int i = 0; i < nNumDetections; i++)
482 float fObjectnessConfidence = trAccessor[i][4];
485 if (fObjectnessConfidence < fMinObjectConfidence)
491 float fCenterX = trAccessor[i][0];
492 float fCenterY = trAccessor[i][1];
493 float fWidth = trAccessor[i][2];
494 float fHeight = trAccessor[i][3];
497 int nLeft =
static_cast<int>((fCenterX - (0.5 * fWidth)) * cvInputFrameSize.
width);
498 int nTop =
static_cast<int>((fCenterY - (0.5 * fHeight)) * cvInputFrameSize.
height);
499 int nBoundingWidth =
static_cast<int>(fWidth * cvInputFrameSize.
width);
500 int nBoundingHeight =
static_cast<int>(fHeight * cvInputFrameSize.
height);
503 cv::Rect cvBoundingBox(nLeft, nTop, nBoundingWidth, nBoundingHeight);
506 float fClassConfidence = -1.0f;
508 for (
int j = 5; j < nTotalValues; j++)
510 float fConfidence = trAccessor[i][j];
511 if (fConfidence > fClassConfidence)
513 fClassConfidence = fConfidence;
519 if (fClassConfidence < fMinObjectConfidence)
525 vClassIDs.emplace_back(nClassID);
526 vClassConfidences.emplace_back(fClassConfidence);
527 vBoundingBoxes.emplace_back(cvBoundingBox);
551 std::vector<int>& vClassIDs,
552 std::vector<float>& vClassConfidences,
553 std::vector<cv::Rect>& vBoundingBoxes,
555 const float fMinObjectConfidence)
566 torch::Tensor trPermuteOutput = trOutput.permute({0, 2, 1}).squeeze(0);
569 if (trPermuteOutput.device().is_cuda())
571 trPermuteOutput = trPermuteOutput.to(torch::kCPU);
574 if (trPermuteOutput.scalar_type() != torch::kFloat32)
576 trPermuteOutput = trPermuteOutput.to(torch::kFloat32);
579 if (!trPermuteOutput.is_contiguous())
581 trPermuteOutput = trPermuteOutput.contiguous();
585 at::TensorAccessor trAccessor = trPermuteOutput.accessor<float, 2>();
586 const int nNumDetections = trPermuteOutput.size(0);
587 const int nTotalValues = trPermuteOutput.size(1);
590 for (
int i = 0; i < nNumDetections; i++)
592 float fClassConfidence = -1.0f;
596 for (
int j = 4; j < nTotalValues; j++)
598 float fConfidence = trAccessor[i][j];
599 if (fConfidence > fClassConfidence)
601 fClassConfidence = fConfidence;
607 if (fClassConfidence < fMinObjectConfidence)
613 float fCenterX = trAccessor[i][0];
614 float fCenterY = trAccessor[i][1];
615 float fWidth = trAccessor[i][2];
616 float fHeight = trAccessor[i][3];
619 int nLeft =
static_cast<int>(fCenterX * cvInputFrameSize.
width / 640.0f - (0.5f * fWidth * cvInputFrameSize.
width / 640.0f));
620 int nTop =
static_cast<int>(fCenterY * cvInputFrameSize.
height / 640.0f - (0.5f * fHeight * cvInputFrameSize.
height / 640.0f));
621 int nBoxWidth =
static_cast<int>(fWidth * cvInputFrameSize.
width / 640.0f);
622 int nBoxHeight =
static_cast<int>(fHeight * cvInputFrameSize.
height / 640.0f);
623 cv::Rect cvBoundingBox(nLeft, nTop, nBoxWidth, nBoxHeight);
626 vClassIDs.push_back(nClassID);
627 vClassConfidences.push_back(fClassConfidence);
628 vBoundingBoxes.push_back(cvBoundingBox);
635 torch::jit::script::Module m_trModel;
636 torch::Device m_trDevice = torch::kCPU;
637 std::string m_szModelPath;
639 std::string m_szModelTask;
641 std::vector<std::string> m_vClassLabels;
void convertTo(OutputArray m, int rtype, double alpha=1, double beta=0) const
This class is designed to enable quick, easy, and robust inference with .pt YOLO models.
Definition YOLOModel.hpp:163
void ParseTensorOutputYOLOv8(const torch::Tensor &trOutput, std::vector< int > &vClassIDs, std::vector< float > &vClassConfidences, std::vector< cv::Rect > &vBoundingBoxes, const cv::Size &cvInputFrameSize, const float fMinObjectConfidence)
Given a tensor output from a YOLOv8 model, parse its output into something more usable.
Definition YOLOModel.hpp:550
bool IsReadyForInference() const
Check if the model is ready for inference.
Definition YOLOModel.hpp:395
std::vector< Detection > Inference(const cv::Mat &cvInputFrame, const float fMinObjectConfidence=0.85, const float fNMSThreshold=0.6)
Given an input image forward the image through the YOLO model to run inference on the PyTorch model,...
Definition YOLOModel.hpp:308
void ParseTensorOutputYOLOv5(const torch::Tensor &trOutput, std::vector< int > &vClassIDs, std::vector< float > &vClassConfidences, std::vector< cv::Rect > &vBoundingBoxes, const cv::Size &cvInputFrameSize, const float fMinObjectConfidence)
Given a tensor output from a YOLOv5 model, parse its output into something more usable.
Definition YOLOModel.hpp:442
PyTorchInterpreter(std::string szModelPath, HardwareDevices eHardwareDevice=HardwareDevices::eCUDA)
Construct a new PyTorchInterpreter object.
Definition YOLOModel.hpp:187
~PyTorchInterpreter()
Destroy the PyTorchInterpreter object.
Definition YOLOModel.hpp:288
torch::Tensor PreprocessImage(const cv::Mat &cvInputFrame, const torch::Device &trDevice)
Given an input image, preprocess the image to match the input tensor shape of the model,...
Definition YOLOModel.hpp:413
void NMSBoxes(const std::vector< Rect > &bboxes, const std::vector< float > &scores, const float score_threshold, const float nms_threshold, std::vector< int > &indices, const float eta=1.f, const int top_k=0)
void cvtColor(InputArray src, OutputArray dst, int code, int dstCn=0)
void rectangle(InputOutputArray img, Point pt1, Point pt2, const Scalar &color, int thickness=1, int lineType=LINE_8, int shift=0)
void putText(InputOutputArray img, const String &text, Point org, int fontFace, double fontScale, Scalar color, int thickness=1, int lineType=LINE_8, bool bottomLeftOrigin=false)
Namespace containing functions or objects/struct used to aid in easy use of YOLO models....
Definition YOLOModel.hpp:34
void NonMaxSuppression(std::vector< Detection > &vObjects, std::vector< int > &vClassIDs, std::vector< float > &vClassConfidences, std::vector< cv::Rect > &vBoundingBoxes, float fMinObjectConfidence, float fNMSThreshold)
Perform non max suppression for the given predictions. This eliminates/combines predictions that over...
Definition YOLOModel.hpp:69
void DrawDetections(cv::Mat &cvInputFrame, std::vector< Detection > &vObjects)
Given an image and a vector of object structs, draw each object bounding box, class type,...
Definition YOLOModel.hpp:107
This struct is used to store a single detection: its class ID, class name, confidence, and bounding box.
Definition YOLOModel.hpp:43