OpenShot Library | libopenshot  0.7.0
CVObjectDetection.cpp
Go to the documentation of this file.
1 
10 // Copyright (c) 2008-2019 OpenShot Studios, LLC
11 //
12 // SPDX-License-Identifier: LGPL-3.0-or-later
13 
14 #include <fstream>
15 #include <iomanip>
16 #include <iostream>
17 #include <algorithm>
18 
19 #include "CVObjectDetection.h"
20 #include "Exceptions.h"
21 
22 #include "objdetectdata.pb.h"
23 #include <google/protobuf/util/time_util.h>
24 
25 using namespace std;
26 using namespace openshot;
27 using google::protobuf::util::TimeUtil;
28 
29 namespace {
30 
31 bool LooksLikeTransposedYoloOutput(const cv::Mat& out, size_t classCount)
32 {
33  // YOLO26 segmentation exports without end-to-end postprocessing use
34  // [1, attributes, candidates], e.g. [1, 116, 8400]:
35  // 4 box channels + class scores + optional mask coefficients.
36  return out.dims == 3 && out.size[0] == 1 && out.size[1] >= 4 &&
37  out.size[2] > out.size[1] &&
38  (classCount == 0 || out.size[1] >= 4 + static_cast<int>(classCount));
39 }
40 
41 cv::Rect ScaledXYWHBox(
42  float centerX,
43  float centerY,
44  float width,
45  float height,
46  const cv::Size& frameDims,
47  int inputWidth,
48  int inputHeight)
49 {
50  if (centerX <= 1.0f && centerY <= 1.0f && width <= 1.0f && height <= 1.0f) {
51  centerX *= static_cast<float>(frameDims.width);
52  width *= static_cast<float>(frameDims.width);
53  centerY *= static_cast<float>(frameDims.height);
54  height *= static_cast<float>(frameDims.height);
55  } else {
56  const float xFactor = static_cast<float>(frameDims.width) / static_cast<float>(inputWidth);
57  const float yFactor = static_cast<float>(frameDims.height) / static_cast<float>(inputHeight);
58  centerX *= xFactor;
59  width *= xFactor;
60  centerY *= yFactor;
61  height *= yFactor;
62  }
63 
64  float left = centerX - width / 2.0f;
65  float top = centerY - height / 2.0f;
66  float right = centerX + width / 2.0f;
67  float bottom = centerY + height / 2.0f;
68 
69  left = std::max(0.0f, std::min(left, static_cast<float>(frameDims.width - 1)));
70  top = std::max(0.0f, std::min(top, static_cast<float>(frameDims.height - 1)));
71  right = std::max(0.0f, std::min(right, static_cast<float>(frameDims.width)));
72  bottom = std::max(0.0f, std::min(bottom, static_cast<float>(frameDims.height)));
73 
74  return cv::Rect(
75  static_cast<int>(left),
76  static_cast<int>(top),
77  std::max(0, static_cast<int>(right - left)),
78  std::max(0, static_cast<int>(bottom - top)));
79 }
80 
// Run-length encode a binary mask (any non-zero byte counts as 1).
//
// The returned runs alternate between background (0) and foreground (1),
// always starting with a background run, so a mask that begins with a
// foreground pixel yields a leading run of length 0. An empty mask yields
// an empty vector.
std::vector<uint32_t> EncodeBinaryMaskRLE(const std::vector<uint8_t>& mask)
{
    std::vector<uint32_t> runs;
    if (!mask.empty()) {
        bool runValue = false;
        uint32_t runLength = 0;
        for (size_t i = 0; i < mask.size(); ++i) {
            const bool pixel = mask[i] != 0;
            if (pixel != runValue) {
                // Value flipped: close the current run and start a new one.
                runs.push_back(runLength);
                runValue = pixel;
                runLength = 0;
            }
            ++runLength;
        }
        runs.push_back(runLength);
    }
    return runs;
}
102 
103 cv::Mat DecodeBinaryMaskRLE(const CVObjectMaskData& mask)
104 {
105  cv::Mat image(mask.height, mask.width, CV_8UC1, cv::Scalar(0));
106  if (!mask.HasData())
107  return image;
108 
109  const int total = mask.width * mask.height;
110  int offset = 0;
111  bool value = false;
112  uint8_t* data = image.ptr<uint8_t>();
113  for (uint32_t count : mask.rle) {
114  const int end = std::min(total, offset + static_cast<int>(count));
115  if (value) {
116  std::fill(data + offset, data + end, static_cast<uint8_t>(1));
117  }
118  offset = end;
119  value = !value;
120  if (offset >= total)
121  break;
122  }
123  return image;
124 }
125 
126 CVObjectMaskData TransformMaskToBox(
127  const CVObjectMaskData& sourceMask,
128  const cv::Rect_<float>& sourceBox,
129  const cv::Rect_<float>& targetBox,
130  const cv::Size& frameDims)
131 {
132  CVObjectMaskData result;
133  if (!sourceMask.HasData() || sourceBox.width <= 0.0f || sourceBox.height <= 0.0f ||
134  targetBox.width <= 0.0f || targetBox.height <= 0.0f ||
135  frameDims.width <= 0 || frameDims.height <= 0) {
136  return result;
137  }
138 
139  const float scaleX = sourceMask.width / static_cast<float>(frameDims.width);
140  const float scaleY = sourceMask.height / static_cast<float>(frameDims.height);
141  const cv::Rect_<float> sourceMaskBox(
142  sourceBox.x * scaleX,
143  sourceBox.y * scaleY,
144  sourceBox.width * scaleX,
145  sourceBox.height * scaleY);
146  const cv::Rect_<float> targetMaskBox(
147  targetBox.x * scaleX,
148  targetBox.y * scaleY,
149  targetBox.width * scaleX,
150  targetBox.height * scaleY);
151  if (sourceMaskBox.width <= 0.0f || sourceMaskBox.height <= 0.0f)
152  return result;
153 
154  const double xScale = targetMaskBox.width / sourceMaskBox.width;
155  const double yScale = targetMaskBox.height / sourceMaskBox.height;
156  cv::Mat transform = (cv::Mat_<double>(2, 3) <<
157  xScale, 0.0, targetMaskBox.x - xScale * sourceMaskBox.x,
158  0.0, yScale, targetMaskBox.y - yScale * sourceMaskBox.y);
159 
160  cv::Mat source = DecodeBinaryMaskRLE(sourceMask);
161  cv::Mat transformed;
162  cv::warpAffine(
163  source, transformed, transform, source.size(),
164  cv::INTER_NEAREST, cv::BORDER_CONSTANT, cv::Scalar(0));
165  if (cv::countNonZero(transformed) == 0)
166  return result;
167 
168  result.width = sourceMask.width;
169  result.height = sourceMask.height;
170  result.rle = EncodeBinaryMaskRLE(
171  std::vector<uint8_t>(transformed.data, transformed.data + transformed.total()));
172  return result;
173 }
174 
175 CVObjectMaskData BuildMaskFromPrototype(
176  const cv::Mat& prototype,
177  const std::vector<float>& coefficients,
178  const cv::Rect& box,
179  const cv::Size& frameDims)
180 {
181  CVObjectMaskData result;
182  if (prototype.dims != 4 || prototype.size[0] != 1 ||
183  prototype.size[1] != static_cast<int>(coefficients.size()))
184  return result;
185 
186  const int channels = prototype.size[1];
187  const int maskHeight = prototype.size[2];
188  const int maskWidth = prototype.size[3];
189  const int maskPixels = maskWidth * maskHeight;
190  const float* protoData = reinterpret_cast<const float*>(prototype.data);
191 
192  const int left = std::max(0, static_cast<int>(box.x * maskWidth / static_cast<float>(frameDims.width)));
193  const int top = std::max(0, static_cast<int>(box.y * maskHeight / static_cast<float>(frameDims.height)));
194  const int right = std::min(maskWidth, static_cast<int>((box.x + box.width) * maskWidth / static_cast<float>(frameDims.width)));
195  const int bottom = std::min(maskHeight, static_cast<int>((box.y + box.height) * maskHeight / static_cast<float>(frameDims.height)));
196  if (left >= right || top >= bottom)
197  return result;
198 
199  std::vector<uint8_t> binary(maskPixels, 0);
200  for (int y = top; y < bottom; ++y) {
201  for (int x = left; x < right; ++x) {
202  const int pixel = y * maskWidth + x;
203  float value = 0.0f;
204  for (int channel = 0; channel < channels; ++channel) {
205  value += coefficients[channel] * protoData[channel * maskPixels + pixel];
206  }
207  binary[pixel] = value > 0.0f ? 1 : 0;
208  }
209  }
210 
211  result.width = maskWidth;
212  result.height = maskHeight;
213  result.rle = EncodeBinaryMaskRLE(binary);
214  return result;
215 }
216 
217 std::string LoadONNXModel(std::string modelPath, cv::dnn::Net *net)
218 {
219 #if CV_VERSION_MAJOR < 4 || (CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 3)
220  return std::string("Failed to load ONNX model: YOLO requires OpenCV 4.3.0 or newer. "
221  "This OpenCV build is ") + CV_VERSION + ".";
222 #else
223  try {
224  cv::dnn::Net loaded_net = cv::dnn::readNetFromONNX(modelPath);
225  if (net) {
226  *net = loaded_net;
227  }
228  return "";
229  } catch (const cv::Exception& e) {
230  std::string error_text = std::string("Failed to load ONNX model: ") + e.what();
231  if (error_text.find("Unsupported data type: FLOAT16") != std::string::npos) {
232  error_text = "Failed to load ONNX model: FLOAT16 is not supported by this OpenCV build. "
233  "Please use an FP32 ONNX model.";
234  }
235  return error_text;
236  } catch (const std::exception& e) {
237  return std::string("Failed to load ONNX model: ") + e.what();
238  } catch (...) {
239  return "Failed to load ONNX model: unknown error";
240  }
241 #endif
242 }
243 
244 }
245 
246 CVObjectDetection::CVObjectDetection(std::string processInfoJson, ProcessingController &processingController)
247 : processingController(&processingController), processingDevice("CPU"), inpWidth(640), inpHeight(640), generateMasks(true){
248  confThreshold = 0.10;
249  nmsThreshold = 0.1;
250  SetJson(processInfoJson);
251 }
252 
253 std::string CVObjectDetection::ValidateONNXModel(std::string modelPath)
254 {
255  return LoadONNXModel(modelPath, nullptr);
256 }
257 
258 void CVObjectDetection::setProcessingDevice(){
259  if(processingDevice == "GPU"){
260  try {
261  const std::vector<cv::dnn::Target> targets = cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA);
262  if (std::find(targets.begin(), targets.end(), cv::dnn::DNN_TARGET_CUDA) != targets.end()) {
263  net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
264  net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
265  return;
266  }
267  } catch (const cv::Exception&) {
268  }
269  processingDevice = "CPU";
270  }
271 
272  if(processingDevice == "CPU"){
273  net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
274  net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
275  }
276 }
277 
278 void CVObjectDetection::detectObjectsClip(openshot::Clip &video, size_t _start, size_t _end, bool process_interval)
279 {
280 
281  start = _start; end = _end;
282 
283  video.Open();
284 
285  if(error){
286  return;
287  }
288 
289  processingController->SetError(false, "");
290 
291  if(modelPath.empty()) {
292  processingController->SetError(true, "Missing path to YOLO ONNX model file");
293  error = true;
294  return;
295  }
296  if(classesFile.empty()) {
297  processingController->SetError(true, "Missing path to class name file");
298  error = true;
299  return;
300  }
301 
302  std::ifstream model_file(modelPath);
303  if(!model_file.good()){
304  processingController->SetError(true, "Incorrect path to YOLO ONNX model file");
305  error = true;
306  return;
307  }
308  std::ifstream classes_file(classesFile);
309  if(!classes_file.good()){
310  processingController->SetError(true, "Incorrect path to class name file");
311  error = true;
312  return;
313  }
314 
315  // Load names of classes
316  classNames.clear();
317  std::string line;
318  while (std::getline(classes_file, line)) classNames.push_back(line);
319 
320  // Load the network
321  std::string error_text = LoadONNXModel(modelPath, &net);
322  if (!error_text.empty()) {
323  processingController->SetError(true, error_text);
324  error = true;
325  return;
326  }
327  setProcessingDevice();
328 
329  size_t frame_number;
330  if(!process_interval || end <= 1 || end-start == 0){
331  // Get total number of frames in video
332  start = (int)(video.Start() * video.Reader()->info.fps.ToFloat());
333  end = (int)(video.End() * video.Reader()->info.fps.ToFloat());
334  }
335 
336  for (frame_number = start; frame_number <= end; frame_number++)
337  {
338  // Stop the feature tracker process
339  if(processingController->ShouldStop()){
340  return;
341  }
342 
343  std::shared_ptr<openshot::Frame> f = video.GetFrame(frame_number);
344 
345  // Grab OpenCV Mat image
346  cv::Mat cvimage = f->GetImageCV();
347 
348  DetectObjects(cvimage, frame_number);
349 
350  // Update progress
351  processingController->SetProgress(uint(100*(frame_number-start)/(end-start)));
352 
353  }
354 }
355 
356 void CVObjectDetection::DetectObjects(const cv::Mat &frame, size_t frameId){
357  // Get frame as OpenCV Mat
358  cv::Mat blob;
359 
360  // Create a 4D blob from the frame.
361  cv::dnn::blobFromImage(frame, blob, 1/255.0, cv::Size(inpWidth, inpHeight), cv::Scalar(0,0,0), true, false);
362 
363  std::vector<cv::Mat> outs;
364  try {
365  // Sets the input to the network
366  net.setInput(blob);
367  // Runs the forward pass to get output of the output layers
368  net.forward(outs, getOutputsNames(net));
369  } catch (const cv::Exception& e) {
370  processingController->SetError(true, std::string("Object detection inference failed: ") + e.what());
371  error = true;
372  return;
373  }
374 
375  // Remove the bounding boxes with low confidence
376  postprocess(frame.size(), outs, frameId);
377 
378 }
379 
380 
381 // Remove the bounding boxes with low confidence using non-maxima suppression
382 void CVObjectDetection::postprocess(const cv::Size &frameDims, const std::vector<cv::Mat>& outs, size_t frameId)
383 {
384  std::vector<int> classIds;
385  std::vector<float> confidences;
386  std::vector<cv::Rect> boxes;
387  std::vector<std::vector<ClassScore>> detectionClassScores;
388  std::vector<CVObjectMaskData> detectionMasks;
389  std::vector<int> objectIds;
390  const int maxClassCandidates = 5;
391 
392  for (size_t i = 0; i < outs.size(); ++i) {
393  cv::Mat det = outs[i];
394 
395  if (LooksLikeTransposedYoloOutput(det, classNames.size())) {
396  const int attributes = det.size[1];
397  const int candidates = det.size[2];
398  const int classCount = !classNames.empty()
399  ? static_cast<int>(classNames.size())
400  : attributes - 4;
401  const int maskCoefficientCount = attributes - 4 - classCount;
402  const cv::Mat* prototype = nullptr;
403  if (generateMasks && maskCoefficientCount > 0) {
404  auto prototypeIt = std::find_if(outs.begin(), outs.end(),
405  [maskCoefficientCount](const cv::Mat& out) {
406  return out.dims == 4 && out.size[0] == 1 && out.size[1] == maskCoefficientCount;
407  });
408  if (prototypeIt != outs.end()) {
409  prototype = &(*prototypeIt);
410  }
411  }
412  const float* data = reinterpret_cast<const float*>(det.data);
413 
414  for (int candidateIndex = 0; candidateIndex < candidates; ++candidateIndex) {
415  std::vector<ClassScore> rowClassScores;
416  rowClassScores.reserve(maxClassCandidates);
417 
418  for (int classIndex = 0; classIndex < classCount; ++classIndex) {
419  const float classConfidence = data[(4 + classIndex) * candidates + candidateIndex];
420  if (rowClassScores.size() < static_cast<size_t>(maxClassCandidates)) {
421  rowClassScores.emplace_back(classIndex, classConfidence);
422  std::sort(rowClassScores.begin(), rowClassScores.end(),
423  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
424  } else if (classConfidence > rowClassScores.back().score) {
425  rowClassScores.back() = ClassScore(classIndex, classConfidence);
426  std::sort(rowClassScores.begin(), rowClassScores.end(),
427  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
428  }
429  }
430 
431  if (rowClassScores.empty() || rowClassScores.front().score <= confThreshold) {
432  continue;
433  }
434 
435  cv::Rect box = ScaledXYWHBox(
436  data[candidateIndex],
437  data[candidates + candidateIndex],
438  data[2 * candidates + candidateIndex],
439  data[3 * candidates + candidateIndex],
440  frameDims, inpWidth, inpHeight);
441  if (box.width <= 0 || box.height <= 0) {
442  continue;
443  }
444 
445  classIds.push_back(rowClassScores.front().classId);
446  confidences.push_back(rowClassScores.front().score);
447  boxes.push_back(box);
448  detectionClassScores.push_back(rowClassScores);
449  if (prototype) {
450  std::vector<float> coefficients;
451  coefficients.reserve(maskCoefficientCount);
452  for (int coefficientIndex = 0; coefficientIndex < maskCoefficientCount; ++coefficientIndex) {
453  coefficients.push_back(data[(4 + classCount + coefficientIndex) * candidates + candidateIndex]);
454  }
455  detectionMasks.push_back(BuildMaskFromPrototype(*prototype, coefficients, box, frameDims));
456  } else {
457  detectionMasks.push_back({});
458  }
459  }
460  continue;
461  }
462 
463  // YOLOv5-style ONNX output is usually [1, num_boxes, num_classes + 5].
464  if (det.dims == 3) {
465  det = det.reshape(1, det.size[1]);
466  }
467  if (det.dims != 2 || det.cols < 6) {
468  continue;
469  }
470 
471  const float xFactor = static_cast<float>(frameDims.width) / static_cast<float>(inpWidth);
472  const float yFactor = static_cast<float>(frameDims.height) / static_cast<float>(inpHeight);
473 
474  float* data = reinterpret_cast<float*>(det.data);
475  for (int j = 0; j < det.rows; ++j, data += det.cols) {
476  std::vector<ClassScore> rowClassScores;
477  rowClassScores.reserve(maxClassCandidates);
478  int classScoresEnd = det.cols;
479  if (!classNames.empty()) {
480  classScoresEnd = std::min(det.cols, 5 + static_cast<int>(classNames.size()));
481  }
482  for (int classIndex = 5; classIndex < classScoresEnd; ++classIndex) {
483  const float classConfidence = data[classIndex] * data[4];
484  if (rowClassScores.size() < static_cast<size_t>(maxClassCandidates)) {
485  rowClassScores.emplace_back(classIndex - 5, classConfidence);
486  std::sort(rowClassScores.begin(), rowClassScores.end(),
487  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
488  } else if (classConfidence > rowClassScores.back().score) {
489  rowClassScores.back() = ClassScore(classIndex - 5, classConfidence);
490  std::sort(rowClassScores.begin(), rowClassScores.end(),
491  [](const ClassScore& a, const ClassScore& b) { return a.score > b.score; });
492  }
493  }
494  if (rowClassScores.empty()) {
495  continue;
496  }
497 
498  float confidence = rowClassScores.front().score;
499 
500  if (confidence > confThreshold) {
501  int centerX = 0;
502  int centerY = 0;
503  int width = 0;
504  int height = 0;
505 
506  if (data[0] > 1.0f || data[1] > 1.0f || data[2] > 1.0f || data[3] > 1.0f) {
507  centerX = static_cast<int>(data[0] * xFactor);
508  centerY = static_cast<int>(data[1] * yFactor);
509  width = static_cast<int>(data[2] * xFactor);
510  height = static_cast<int>(data[3] * yFactor);
511  } else {
512  centerX = static_cast<int>(data[0] * frameDims.width);
513  centerY = static_cast<int>(data[1] * frameDims.height);
514  width = static_cast<int>(data[2] * frameDims.width);
515  height = static_cast<int>(data[3] * frameDims.height);
516  }
517 
518  int left = centerX - width / 2;
519  int top = centerY - height / 2;
520 
521  classIds.push_back(rowClassScores.front().classId);
522  confidences.push_back(confidence);
523  boxes.push_back(cv::Rect(left, top, width, height));
524  detectionClassScores.push_back(rowClassScores);
525  detectionMasks.push_back({});
526  }
527  }
528  }
529 
530  // Perform non maximum suppression to eliminate redundant overlapping boxes with
531  // lower confidences
532  std::vector<int> indices;
533  cv::dnn::NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
534 
535  // Pass boxes to SORT algorithm
536  std::vector<cv::Rect> sortBoxes;
537  std::vector<float> sortConfidences;
538  std::vector<int> sortClassIds;
539  std::vector<std::vector<ClassScore>> sortClassScores;
540  std::vector<CVObjectMaskData> sortMasks;
541  for(auto index : indices) {
542  sortBoxes.push_back(boxes[index]);
543  sortConfidences.push_back(confidences[index]);
544  sortClassIds.push_back(classIds[index]);
545  sortClassScores.push_back(detectionClassScores[index]);
546  sortMasks.push_back(index < static_cast<int>(detectionMasks.size()) ? detectionMasks[index] : CVObjectMaskData());
547  }
548  sort.update(sortBoxes, frameId, sqrt(pow(frameDims.width,2) + pow(frameDims.height, 2)), sortConfidences, sortClassIds, sortClassScores);
549 
550  // Clear data vectors
551  boxes.clear(); confidences.clear(); classIds.clear(); objectIds.clear();
552  std::vector<CVObjectMaskData> masks;
553  // Get SORT predicted boxes
554  for(auto TBox : sort.frameTrackingResult){
555  if(TBox.frame == frameId){
556  boxes.push_back(TBox.box);
557  confidences.push_back(TBox.confidence);
558  classIds.push_back(TBox.classId);
559  objectIds.push_back(TBox.id);
560  CVObjectMaskData mask;
561  double bestIoU = 0.0;
562  for (size_t maskIndex = 0; maskIndex < sortMasks.size(); ++maskIndex) {
563  if (!sortMasks[maskIndex].HasData() || sortClassIds[maskIndex] != TBox.classId)
564  continue;
565  double score = SortTracker::GetIOU(cv::Rect_<float>(sortBoxes[maskIndex]), TBox.box);
566  if (score > bestIoU) {
567  bestIoU = score;
568  mask = sortMasks[maskIndex];
569  }
570  }
571  if (mask.HasData()) {
572  recentObjectMasks[TBox.id] = CVTrackedMaskData{frameId, mask, TBox.box};
573  } else {
574  const auto recentMask = recentObjectMasks.find(TBox.id);
575  if (recentMask != recentObjectMasks.end() &&
576  frameId > recentMask->second.frameId &&
577  frameId - recentMask->second.frameId <= 5) {
578  mask = TransformMaskToBox(
579  recentMask->second.mask,
580  recentMask->second.box,
581  TBox.box,
582  frameDims);
583  if (mask.HasData()) {
584  recentObjectMasks[TBox.id] = CVTrackedMaskData{frameId, mask, TBox.box};
585  }
586  }
587  }
588  masks.push_back(mask);
589  }
590  }
591 
592  // Remove boxes based on controids distance
593  for(uint i = 0; i<boxes.size(); i++){
594  for(uint j = i+1; j<boxes.size(); j++){
595  int xc_1 = boxes[i].x + (int)(boxes[i].width/2), yc_1 = boxes[i].y + (int)(boxes[i].height/2);
596  int xc_2 = boxes[j].x + (int)(boxes[j].width/2), yc_2 = boxes[j].y + (int)(boxes[j].height/2);
597 
598  if(fabs(xc_1 - xc_2) < 10 && fabs(yc_1 - yc_2) < 10){
599  if(classIds[i] == classIds[j]){
600  if(confidences[i] >= confidences[j]){
601  boxes.erase(boxes.begin() + j);
602  classIds.erase(classIds.begin() + j);
603  confidences.erase(confidences.begin() + j);
604  objectIds.erase(objectIds.begin() + j);
605  masks.erase(masks.begin() + j);
606  break;
607  }
608  else{
609  boxes.erase(boxes.begin() + i);
610  classIds.erase(classIds.begin() + i);
611  confidences.erase(confidences.begin() + i);
612  objectIds.erase(objectIds.begin() + i);
613  masks.erase(masks.begin() + i);
614  i = 0;
615  break;
616  }
617  }
618  }
619  }
620  }
621 
622  // Remove boxes based in IOU score
623  for(uint i = 0; i<boxes.size(); i++){
624  for(uint j = i+1; j<boxes.size(); j++){
625 
626  if( iou(boxes[i], boxes[j])){
627  if(classIds[i] == classIds[j]){
628  if(confidences[i] >= confidences[j]){
629  boxes.erase(boxes.begin() + j);
630  classIds.erase(classIds.begin() + j);
631  confidences.erase(confidences.begin() + j);
632  objectIds.erase(objectIds.begin() + j);
633  masks.erase(masks.begin() + j);
634  break;
635  }
636  else{
637  boxes.erase(boxes.begin() + i);
638  classIds.erase(classIds.begin() + i);
639  confidences.erase(confidences.begin() + i);
640  objectIds.erase(objectIds.begin() + i);
641  masks.erase(masks.begin() + i);
642  i = 0;
643  break;
644  }
645  }
646  }
647  }
648  }
649 
650  // Normalize boxes coordinates
651  std::vector<cv::Rect_<float>> normalized_boxes;
652  for(auto box : boxes){
653  cv::Rect_<float> normalized_box;
654  normalized_box.x = (box.x)/(float)frameDims.width;
655  normalized_box.y = (box.y)/(float)frameDims.height;
656  normalized_box.width = (box.width)/(float)frameDims.width;
657  normalized_box.height = (box.height)/(float)frameDims.height;
658  normalized_boxes.push_back(normalized_box);
659  }
660 
661  detectionsData[frameId] = CVDetectionData(classIds, confidences, normalized_boxes, frameId, objectIds, masks);
662 }
663 
664 // Compute IOU between 2 boxes
665 bool CVObjectDetection::iou(cv::Rect pred_box, cv::Rect sort_box){
666  // Determine the (x, y)-coordinates of the intersection rectangle
667  int xA = std::max(pred_box.x, sort_box.x);
668  int yA = std::max(pred_box.y, sort_box.y);
669  int xB = std::min(pred_box.x + pred_box.width, sort_box.x + sort_box.width);
670  int yB = std::min(pred_box.y + pred_box.height, sort_box.y + sort_box.height);
671 
672  // Compute the area of intersection rectangle
673  int interArea = std::max(0, xB - xA + 1) * std::max(0, yB - yA + 1);
674 
675  // Compute the area of both the prediction and ground-truth rectangles
676  int boxAArea = (pred_box.width + 1) * (pred_box.height + 1);
677  int boxBArea = (sort_box.width + 1) * (sort_box.height + 1);
678 
679  // Compute the intersection over union by taking the intersection
680  float iou = interArea / (float)(boxAArea + boxBArea - interArea);
681 
682  // If IOU is above this value the boxes are very close (probably a variation of the same bounding box)
683  if(iou > 0.5)
684  return true;
685  return false;
686 }
687 
688 // Get the names of the output layers
689 std::vector<cv::String> CVObjectDetection::getOutputsNames(const cv::dnn::Net& net)
690 {
691  //Get the indices of the output layers, i.e. the layers with unconnected outputs
692  std::vector<int> outLayers = net.getUnconnectedOutLayers();
693 
694  //get the names of all the layers in the network
695  std::vector<cv::String> layersNames = net.getLayerNames();
696 
697  // Get the names of the output layers in names
698  std::vector<cv::String> names;
699  names.resize(outLayers.size());
700  for (size_t i = 0; i < outLayers.size(); ++i)
701  names[i] = layersNames[outLayers[i] - 1];
702  return names;
703 }
704 
706  // Check if the stabilizer info for the requested frame exists
707  if ( detectionsData.find(frameId) == detectionsData.end() ) {
708 
709  return CVDetectionData();
710  } else {
711 
712  return detectionsData[frameId];
713  }
714 }
715 
716 void CVObjectDetection::NormalizeTrackedClasses()
717 {
718  struct ClassEvidence {
719  float confidenceSum = 0.0f;
720  size_t count = 0;
721  };
722 
723  std::map<int, std::map<int, ClassEvidence>> objectClassEvidence;
724  for (const auto& frameData : detectionsData) {
725  const CVDetectionData& detections = frameData.second;
726  const size_t detectionCount = std::min(detections.objectIds.size(), detections.classIds.size());
727  for (size_t i = 0; i < detectionCount; ++i) {
728  const float confidence = i < detections.confidences.size() ? detections.confidences[i] : 1.0f;
729  ClassEvidence& evidence = objectClassEvidence[detections.objectIds[i]][detections.classIds[i]];
730  evidence.confidenceSum += confidence;
731  ++evidence.count;
732  }
733  }
734 
735  std::map<int, int> dominantClassByObject;
736  for (const auto& objectEvidence : objectClassEvidence) {
737  const int objectId = objectEvidence.first;
738  int bestClassId = -1;
739  ClassEvidence bestEvidence;
740  for (const auto& classEvidence : objectEvidence.second) {
741  const int classId = classEvidence.first;
742  const ClassEvidence& evidence = classEvidence.second;
743  if (bestClassId < 0 ||
744  evidence.confidenceSum > bestEvidence.confidenceSum ||
745  (evidence.confidenceSum == bestEvidence.confidenceSum && evidence.count > bestEvidence.count)) {
746  bestClassId = classId;
747  bestEvidence = evidence;
748  }
749  }
750  if (bestClassId >= 0) {
751  dominantClassByObject[objectId] = bestClassId;
752  }
753  }
754 
755  for (auto& frameData : detectionsData) {
756  CVDetectionData& detections = frameData.second;
757  const size_t detectionCount = std::min(detections.objectIds.size(), detections.classIds.size());
758  for (size_t i = 0; i < detectionCount; ++i) {
759  const auto dominantClass = dominantClassByObject.find(detections.objectIds[i]);
760  if (dominantClass != dominantClassByObject.end()) {
761  detections.classIds[i] = dominantClass->second;
762  }
763  }
764  }
765 }
766 
768  if(protobuf_data_path.empty()) {
769  cerr << "Missing path to object detection protobuf data file." << endl;
770  return false;
771  }
772 
773  NormalizeTrackedClasses();
774 
775  // Create tracker message
776  pb_objdetect::ObjDetect objMessage;
777 
778  //Save class names in protobuf message
779  for(int i = 0; i<classNames.size(); i++){
780  std::string* className = objMessage.add_classnames();
781  className->assign(classNames.at(i));
782  }
783 
784  // Iterate over all frames data and save in protobuf message
785  for(std::map<size_t,CVDetectionData>::iterator it=detectionsData.begin(); it!=detectionsData.end(); ++it){
786  CVDetectionData dData = it->second;
787  AddFrameDataToProto(objMessage.add_frame(), dData);
788  }
789 
790  // Add timestamp
791  *objMessage.mutable_last_updated() = TimeUtil::SecondsToTimestamp(time(NULL));
792 
793  {
794  // Write the new message to disk.
795  std::fstream output(protobuf_data_path, ios::out | ios::trunc | ios::binary);
796  if (!objMessage.SerializeToOstream(&output)) {
797  cerr << "Failed to write protobuf message." << endl;
798  return false;
799  }
800  }
801 
802  // Delete all global objects allocated by libprotobuf.
803  google::protobuf::ShutdownProtobufLibrary();
804 
805  return true;
806 
807 }
808 
809 // Add frame object detection into protobuf message.
810 void CVObjectDetection::AddFrameDataToProto(pb_objdetect::Frame* pbFrameData, CVDetectionData& dData) {
811 
812  // Save frame number and rotation
813  pbFrameData->set_id(dData.frameId);
814 
815  for(size_t i = 0; i < dData.boxes.size(); i++){
816  pb_objdetect::Frame_Box* box = pbFrameData->add_bounding_box();
817 
818  // Save bounding box data
819  box->set_x(dData.boxes.at(i).x);
820  box->set_y(dData.boxes.at(i).y);
821  box->set_w(dData.boxes.at(i).width);
822  box->set_h(dData.boxes.at(i).height);
823  box->set_classid(dData.classIds.at(i));
824  box->set_confidence(dData.confidences.at(i));
825  box->set_objectid(dData.objectIds.at(i));
826 
827  if (i < dData.masks.size() && dData.masks.at(i).HasData()) {
828  pb_objdetect::Frame_Box_Mask* mask = box->mutable_mask();
829  mask->set_width(dData.masks.at(i).width);
830  mask->set_height(dData.masks.at(i).height);
831  for (uint32_t count : dData.masks.at(i).rle) {
832  mask->add_rle(count);
833  }
834  }
835  }
836 }
837 
838 // Load JSON string into this object
839 void CVObjectDetection::SetJson(const std::string value) {
840  // Parse JSON string into JSON objects
841  try
842  {
843  const Json::Value root = openshot::stringToJson(value);
844  // Set all values that match
845 
846  SetJsonValue(root);
847  }
848  catch (const std::exception& e)
849  {
850  // Error parsing JSON (or missing keys)
851  // throw InvalidJSON("JSON is invalid (missing keys or invalid data types)");
852  std::cout<<"JSON is invalid (missing keys or invalid data types)"<<std::endl;
853  }
854 }
855 
856 // Load Json::Value into this object
857 void CVObjectDetection::SetJsonValue(const Json::Value root) {
858 
859  // Set data from Json (if key is found)
860  if (!root["protobuf_data_path"].isNull()){
861  protobuf_data_path = (root["protobuf_data_path"].asString());
862  }
863 
864  if (!root["processing-device"].isNull()){
865  processingDevice = (root["processing-device"].asString());
866  }
867  if (!root["processing_device"].isNull()){
868  processingDevice = (root["processing_device"].asString());
869  }
870  if (!root["class-names"].isNull()){
871  classesFile = (root["class-names"].asString());
872  }
873  if (!root["classes_file"].isNull()){
874  classesFile = (root["classes_file"].asString());
875  }
876  if (!root["model"].isNull()){
877  modelPath = (root["model"].asString());
878  }
879  if (!root["model_path"].isNull()){
880  modelPath = (root["model_path"].asString());
881  }
882  if (!root["input-width"].isNull()){
883  inpWidth = root["input-width"].asInt();
884  }
885  if (!root["input_width"].isNull()){
886  inpWidth = root["input_width"].asInt();
887  }
888  if (!root["input-height"].isNull()){
889  inpHeight = root["input-height"].asInt();
890  }
891  if (!root["input_height"].isNull()){
892  inpHeight = root["input_height"].asInt();
893  }
894  if (!root["confidence-threshold"].isNull()){
895  confThreshold = root["confidence-threshold"].asFloat();
896  }
897  if (!root["confidence_threshold"].isNull()){
898  confThreshold = root["confidence_threshold"].asFloat();
899  }
900  if (!root["nms-threshold"].isNull()){
901  nmsThreshold = root["nms-threshold"].asFloat();
902  }
903  if (!root["nms_threshold"].isNull()){
904  nmsThreshold = root["nms_threshold"].asFloat();
905  }
906  if (!root["generate-masks"].isNull()){
907  generateMasks = root["generate-masks"].asBool();
908  }
909  if (!root["generate_masks"].isNull()){
910  generateMasks = root["generate_masks"].asBool();
911  }
912 }
913 
914 /*
915 ||||||||||||||||||||||||||||||||||||||||||||||||||
916  ONLY FOR MAKE TEST
917 ||||||||||||||||||||||||||||||||||||||||||||||||||
918 */
919 
920 // Load protobuf data file
922  if(protobuf_data_path.empty()) {
923  cerr << "Missing path to object detection protobuf data file." << endl;
924  return false;
925  }
926 
927  // Create tracker message
928  pb_objdetect::ObjDetect objMessage;
929 
930  {
931  // Read the existing tracker message.
932  fstream input(protobuf_data_path, ios::in | ios::binary);
933  if (!objMessage.ParseFromIstream(&input)) {
934  cerr << "Failed to parse protobuf message." << endl;
935  return false;
936  }
937  }
938 
939  // Make sure classNames and detectionsData are empty
940  classNames.clear(); detectionsData.clear();
941 
942  // Get all classes names and assign a color to them
943  for(int i = 0; i < objMessage.classnames_size(); i++){
944  classNames.push_back(objMessage.classnames(i));
945  }
946 
947  // Iterate over all frames of the saved message
948  for (size_t i = 0; i < objMessage.frame_size(); i++) {
949  // Create protobuf message reader
950  const pb_objdetect::Frame& pbFrameData = objMessage.frame(i);
951 
952  // Get frame Id
953  size_t id = pbFrameData.id();
954 
955  // Load bounding box data
956  const google::protobuf::RepeatedPtrField<pb_objdetect::Frame_Box > &pBox = pbFrameData.bounding_box();
957 
958  // Construct data vectors related to detections in the current frame
959  std::vector<int> classIds;
960  std::vector<float> confidences;
961  std::vector<cv::Rect_<float>> boxes;
962  std::vector<int> objectIds;
963  std::vector<CVObjectMaskData> masks;
964 
965  for(int i = 0; i < pbFrameData.bounding_box_size(); i++){
966  // Get bounding box coordinates
967  float x = pBox.Get(i).x(); float y = pBox.Get(i).y();
968  float w = pBox.Get(i).w(); float h = pBox.Get(i).h();
969  // Create OpenCV rectangle with the bouding box info
970  cv::Rect_<float> box(x, y, w, h);
971 
972  // Get class Id (which will be assign to a class name) and prediction confidence
973  int classId = pBox.Get(i).classid(); float confidence = pBox.Get(i).confidence();
974  // Get object Id
975  int objectId = pBox.Get(i).objectid();
976 
977  // Push back data into vectors
978  boxes.push_back(box); classIds.push_back(classId); confidences.push_back(confidence);
979  objectIds.push_back(objectId);
980  CVObjectMaskData mask;
981  if (pBox.Get(i).has_mask()) {
982  mask.width = pBox.Get(i).mask().width();
983  mask.height = pBox.Get(i).mask().height();
984  for (int rleIndex = 0; rleIndex < pBox.Get(i).mask().rle_size(); ++rleIndex) {
985  mask.rle.push_back(pBox.Get(i).mask().rle(rleIndex));
986  }
987  }
988  masks.push_back(mask);
989  }
990 
991  // Assign data to object detector map
992  detectionsData[id] = CVDetectionData(classIds, confidences, boxes, id, objectIds, masks);
993  }
994 
995  // Delete all global objects allocated by libprotobuf.
996  google::protobuf::ShutdownProtobufLibrary();
997 
998  return true;
999 }
openshot::stringToJson
const Json::Value stringToJson(const std::string value)
Definition: Json.cpp:16
openshot::Clip::Open
void Open() override
Open the internal reader.
Definition: Clip.cpp:387
CVObjectDetection.h
Header file for CVObjectDetection class.
openshot::CVObjectMaskData::width
int width
Definition: CVObjectDetection.h:37
ProcessingController::ShouldStop
bool ShouldStop()
Definition: ProcessingController.h:68
ProcessingController::SetError
void SetError(bool err, std::string message)
Definition: ProcessingController.h:74
openshot
This namespace is the default namespace for all code in the openshot library.
Definition: AnimatedCurve.h:24
openshot::CVDetectionData::classIds
std::vector< int > classIds
Definition: CVObjectDetection.h:69
openshot::Clip
This class represents a clip (used to arrange readers on the timeline)
Definition: Clip.h:89
openshot::CVDetectionData::masks
std::vector< CVObjectMaskData > masks
Definition: CVObjectDetection.h:73
openshot::Clip::End
float End() const override
Get end position (in seconds) of clip (trim end of video), which can be affected by the time curve.
Definition: Clip.cpp:423
openshot::CVObjectDetection::detectionsData
std::map< size_t, CVDetectionData > detectionsData
Definition: CVObjectDetection.h:127
openshot::Clip::GetFrame
std::shared_ptr< openshot::Frame > GetFrame(int64_t clip_frame_number) override
Get an openshot::Frame object for a specific frame number of this clip. The image size and number of ...
Definition: Clip.cpp:458
openshot::CVDetectionData::confidences
std::vector< float > confidences
Definition: CVObjectDetection.h:70
SortTracker::frameTrackingResult
std::vector< TrackingBox > frameTrackingResult
Definition: sort.hpp:57
openshot::CVObjectDetection::GetDetectionData
CVDetectionData GetDetectionData(size_t frameId)
Definition: CVObjectDetection.cpp:705
openshot::CVObjectDetection::AddFrameDataToProto
void AddFrameDataToProto(pb_objdetect::Frame *pbFrameData, CVDetectionData &dData)
Definition: CVObjectDetection.cpp:810
openshot::CVDetectionData::objectIds
std::vector< int > objectIds
Definition: CVObjectDetection.h:72
ClassScore
Definition: KalmanTracker.h:20
openshot::CVObjectDetection::_LoadObjDetectdData
bool _LoadObjDetectdData()
Definition: CVObjectDetection.cpp:921
openshot::CVObjectMaskData::rle
std::vector< uint32_t > rle
Definition: CVObjectDetection.h:39
openshot::CVDetectionData::frameId
size_t frameId
Definition: CVObjectDetection.h:68
openshot::CVDetectionData
Definition: CVObjectDetection.h:51
openshot::ClipBase::Start
void Start(float value)
Set start position (in seconds) of clip (trim start of video)
Definition: ClipBase.cpp:42
openshot::CVObjectMaskData::HasData
bool HasData() const
Definition: CVObjectDetection.h:41
openshot::CVObjectMaskData
Definition: CVObjectDetection.h:36
SortTracker::GetIOU
static double GetIOU(cv::Rect_< float > bb_test, cv::Rect_< float > bb_gt)
Definition: sort.cpp:89
openshot::CVObjectDetection::SetJsonValue
void SetJsonValue(const Json::Value root)
Load Json::Value into this object.
Definition: CVObjectDetection.cpp:857
openshot::CVDetectionData::boxes
std::vector< cv::Rect_< float > > boxes
Definition: CVObjectDetection.h:71
openshot::CVObjectDetection::SetJson
void SetJson(const std::string value)
Load JSON string into this object.
Definition: CVObjectDetection.cpp:839
ProcessingController
Definition: ProcessingController.h:20
openshot::CVTrackedMaskData
Definition: CVObjectDetection.h:44
openshot::CVObjectDetection::ValidateONNXModel
static std::string ValidateONNXModel(std::string modelPath)
Definition: CVObjectDetection.cpp:253
openshot::CVObjectDetection::SaveObjDetectedData
bool SaveObjDetectedData()
Protobuf Save and Load methods.
Definition: CVObjectDetection.cpp:767
SortTracker::update
void update(std::vector< cv::Rect > detection, int frame_count, double image_diagonal, std::vector< float > confidences, std::vector< int > classIds, std::vector< std::vector< ClassScore >> classScores={})
Definition: sort.cpp:151
openshot::Clip::Reader
void Reader(openshot::ReaderBase *new_reader)
Set the current reader.
Definition: Clip.cpp:340
ProcessingController::SetProgress
void SetProgress(uint p)
Definition: ProcessingController.h:52
openshot::CVObjectDetection::detectObjectsClip
void detectObjectsClip(openshot::Clip &video, size_t start=0, size_t end=0, bool process_interval=false)
Definition: CVObjectDetection.cpp:278
openshot::CVObjectMaskData::height
int height
Definition: CVObjectDetection.h:38
Exceptions.h
Header file for all Exception classes.