diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3d91d91 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__ +data/protoc +build/ +dist/ +tf_trt_models.egg-info/ +data/ +logs/ diff --git a/README.md b/README.md index fa99207..878e180 100644 --- a/README.md +++ b/README.md @@ -1,252 +1,165 @@ TensorFlow/TensorRT Models on Jetson ==================================== -

-landing graphic -

- -This repository contains scripts and documentation to use TensorFlow image classification and object detection models on NVIDIA Jetson. The models are sourced from the [TensorFlow models repository](https://github.com/tensorflow/models) -and optimized using TensorRT. +This repository was forked from NVIDIA's [tf_trt_models](https://github.com/NVIDIA-Jetson/tf_trt_models) repository. It contains scripts to optimize TensorFlow models with TensorRT, as well as scripts for testing/demo. The models are sourced from the [TensorFlow models repository](https://github.com/tensorflow/models). This repository mainly focuses on **object detection** models. * [Setup](#setup) -* [Image Classification](#ic) - * [Models](#ic_models) - * [Download pretrained model](#ic_download) - * [Build TensorRT / Jetson compatible graph](#ic_build) - * [Optimize with TensorRT](#ic_trt) - * [Jupyter Notebook Sample](#ic_notebook) - * [Train for custom task](#ic_train) * [Object Detection](#od) * [Models](#od_models) - * [Download pretrained model](#od_download) - * [Build TensorRT / Jetson compatible graph](#od_build) - * [Optimize with TensorRT](#od_trt) - * [Jupyter Notebook Sample](#od_notebook) - * [Train for custom task](#od_train) + * [Real-time Object Detection with TensorRT Optimized Models](#rt_od) +* [Applying the Hand Detector Model](#hand) Setup ----- -1. Flash your Jetson TX2 with JetPack 3.2 (including TensorRT). -2. Install miscellaneous dependencies on Jetson +Refer to these blog posts for more details: - ``` - sudo apt-get install python-pip python-matplotlib python-pil - ``` - -3. Install TensorFlow 1.7+ (with TensorRT support). Download the [pre-built pip wheel](https://devtalk.nvidia.com/default/topic/1031300/jetson-tx2/tensorflow-1-8-wheel-with-jetpack-3-2-/) and install using pip. +* [TensorFlow/TensorRT Models on Jetson TX2](https://jkjung-avt.github.io/tf-trt-models/) +* [TensorFlow/TensorRT (TF-TRT) Revisited](https://jkjung-avt.github.io/tf-trt-revisited/) - ``` - pip install tensorflow-1.8.0-cp27-cp27mu-linux_aarch64.whl --user - ``` - - or if you're using Python 3. - - ``` - pip3 install tensorflow-1.8.0-cp35-cp35m-linux_aarch64.whl --user - ``` +Otherwise, here are the steps: - -4. Clone this repository +1. Flash the target Jetson TX2 system with either JetPack-3.2.1 (TensorRT 3.0 GA included) or JetPack-3.3 (TensorRT 4.0 GA). (I have also tested the code on Jetson Nano with JetPack-4.2.) +2. Install OpenCV 3.4.x on Jetson. Reference: [How to Install OpenCV (3.4.0) on Jetson TX2](https://jkjung-avt.github.io/opencv3-on-tx2/) or [Installing OpenCV 3.4.6 on Jetson Nano](https://jkjung-avt.github.io/opencv-on-nano/). +3. Download and install tensorflow-1.8.0 (with TensorRT support). More specifically, download [this pip wheel](https://nvidia.app.box.com/v/TF180-Py35-wTRT) if you are using JetPack-3.2.1, or [this pip wheel](https://drive.google.com/open?id=1bAUNe26fKgGXuJiZYs1eT2ig8SCj2gW-) if you are using JetPack-3.3. Then install it with `pip3`. - ``` - git clone --recursive https://github.com/NVIDIA-Jetson/tf_trt_models.git - cd tf_trt_models - ``` + ``` + $ sudo pip3 install tensorflow-1.8.0-cp35-cp35m-linux_aarch64.whl + ``` -5. Run the installation script + **2019-05-24 update:** Originally I encountered an "extremely long TF-TRT model loading time" issue when I tested with tensorflow-1.9.0+. That's why I recommended tensorflow-1.8.0 before. Recently I realized the issue was due to the python3 'protobuf' module, and I have a solution. 
I documented the solution in my [TensorFlow/TensorRT (TF-TRT) Revisited](https://jkjung-avt.github.io/tf-trt-revisited/) post. With the solution applied, you can actually use any tensorflow version 1.8.0 or higher, as long as it has TensorRT support. For example, you could [build/install tensorflow-1.12.2 from source](https://jkjung-avt.github.io/build-tensorflow-1.12.2/), or just use a pip3 wheel provided by NVIDIA. - ``` - ./install.sh - ``` - - or if you want to specify python intepreter - - ``` - ./install.sh python3 - ``` +4. Clone this repository. (Do use this repository instead of NVIDIA's original tf_trt_models repository if you would like to run the script described below.) - -Image Classification -------------------- + ``` + $ cd ~/project + $ git clone --recursive https://github.com/jkjung-avt/tf_trt_models + $ cd tf_trt_models + ``` +5. Run the installation script. -classification + ``` + $ ./install.sh + ``` + +Object Detection +---------------- + +Please refer to the original [NVIDIA-Jetson/tf_trt_models](https://github.com/NVIDIA-Jetson/tf_trt_models) for code snippets which demonstrate how to download pretrained object detection models, how to build the TensorFlow graph, and how to optimize the models with TensorRT. - + ### Models -| Model | Input Size | TF-TRT TX2 | TF TX2 | -|:------|:----------:|-----------:|-------:| -| inception_v1 | 224x224 | 7.36ms | 22.9ms | -| inception_v2 | 224x224 | 9.08ms | 31.8ms | -| inception_v3 | 299x299 | 20.7ms | 74.3ms | -| inception_v4 | 299x299 | 38.5ms | 129ms | -| inception_resnet_v2 | 299x299 | | 158ms | -| resnet_v1_50 | 224x224 | 12.5ms | 55.1ms | -| resnet_v1_101 | 224x224 | 20.6ms | 91.0ms | -| resnet_v1_152 | 224x224 | 28.9ms | 124ms | -| resnet_v2_50 | 299x299 | 26.5ms | 73.4ms | -| resnet_v2_101 | 299x299 | 46.9ms | | -| resnet_v2_152 | 299x299 | 69.0ms | | -| mobilenet_v1_0p25_128 | 128x128 | 3.72ms | 7.99ms | -| mobilenet_v1_0p5_160 | 160x160 | 4.47ms | 8.69ms | -| mobilenet_v1_1p0_224 | 224x224 | 11.1ms | 17.3ms | - -**TF** - Original TensorFlow graph (FP32) - -**TF-TRT** - TensorRT optimized graph (FP16) - -The above benchmark timings were gathered after placing the Jetson TX2 in MAX-N -mode. To do this, run the following commands in a terminal: +The author has tested various TensorFlow object detection models and shared the result on [NVIDIA's Jetson TX2 Developer Forum](https://devtalk.nvidia.com/default/topic/1037019/jetson-tx2/tensorflow-object-detection-and-image-classification-accelerated-for-nvidia-jetson/post/5288250/#5288250). + +Note the benchmark timings were gathered after the Jetson TX2 was placed in MAX-N mode. To set TX2 into MAX-N mode, run the following commands in a terminal: ``` -sudo nvpmodel -m 0 -sudo ~/jetson_clocks.sh +$ sudo nvpmodel -m 0 +$ sudo ~/jetson_clocks.sh ``` - -### Download pretrained model - -As a convenience, we provide a script to download pretrained models sourced from the -TensorFlow models repository. + +### Real-time Object Detection with TensorRT Optimized Models -```python -from tf_trt_models.classification import download_classification_checkpoint +The `camera_tf_trt.py` script supports video inputs from one of the following sources: (1) a video file, say mp4, (2) an image file, say jpg or png, (3) an RTSP stream from an IP CAM, (4) a USB webcam, (5) the Jetson onboard camera. Check out the help message about how to invoke the script with a specific video source. 
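For reference, these input sources all end up as ordinary OpenCV captures inside the script. A minimal sketch is shown below; the file name, device number and RTSP URI are taken from the help message shown next and are only illustrative, and the actual source handling (including the Jetson onboard camera's GStreamer pipeline) lives in `utils/camera.py`:

```python
import cv2

# Illustrative only -- camera_tf_trt.py wraps this logic in utils/camera.py.
cap_file = cv2.VideoCapture('test.mp4')                 # --file --filename test.mp4
cap_usb  = cv2.VideoCapture(1)                          # --usb --vid 1  (i.e. /dev/video1)
cap_rtsp = cv2.VideoCapture('rtsp://192.168.1.64:554')  # --rtsp --uri rtsp://192.168.1.64:554

ret, frame = cap_usb.read()  # 'frame' is a BGR numpy array, ready for detection
```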
-checkpoint_path = download_classification_checkpoint('inception_v2') ``` -To manually download the pretrained models, follow the links [here](https://github.com/tensorflow/models/tree/master/research/slim#Pretrained). + ``` +$ python3 camera_tf_trt.py --help usage: camera_tf_trt.py [-h] [--file] [--image] [--filename FILENAME] [--rtsp] + [--uri RTSP_URI] [--latency RTSP_LATENCY] [--usb] + [--vid VIDEO_DEV] [--width IMAGE_WIDTH] + [--height IMAGE_HEIGHT] [--model MODEL] [--build] + [--tensorboard] [--labelmap LABELMAP_FILE] + [--num-classes NUM_CLASSES] [--confidence CONF_TH] + +This script captures and displays live camera video, and does real-time object +detection with TF-TRT model on Jetson TX2/TX1 + +optional arguments: + -h, --help show this help message and exit + --file use a video file as input (remember to also set + --filename) + --image use an image file as input (remember to also set + --filename) + --filename FILENAME video file name, e.g. test.mp4 + --rtsp use IP CAM (remember to also set --uri) + --uri RTSP_URI RTSP URI, e.g. rtsp://192.168.1.64:554 + --latency RTSP_LATENCY + latency in ms for RTSP [200] + --usb use USB webcam (remember to also set --vid) + --vid VIDEO_DEV device # of USB webcam (/dev/video?) [1] + --width IMAGE_WIDTH image width [1280] + --height IMAGE_HEIGHT + image height [720] + --model MODEL tf-trt object detection model [ssd_inception_v2_coco] + --build re-build TRT pb file (instead of using the previously + built version) + --tensorboard write optimized graph summary to TensorBoard + --labelmap LABELMAP_FILE + [third_party/models/research/object_detection/data/msc + oco_label_map.pbtxt] + --num-classes NUM_CLASSES + number of object classes [90] + --confidence CONF_TH confidence threshold [0.3] +``` - +The `--model` option can currently only be set to `ssd_inception_v2_coco` (default) or `ssd_mobilenet_v1_coco`. It will likely be extended to support more object detection models in the future. The `--build` option only needs to be specified once for each object detection model. The TensorRT-optimized graph is saved/cached to a protobuf file, so that later invocations of the script can load the cached graph directly without going through the optimization process again (a minimal sketch of this build-and-cache flow is included after Example #1 below). -### Build TensorRT / Jetson compatible graph -```python -from tf_trt_models.classification import build_classification_graph +Example #1: build the TensorRT-optimized 'ssd_mobilenet_v1_coco' model and run real-time object detection with a USB webcam. -frozen_graph, input_names, output_names = build_classification_graph( - model='inception_v2', - checkpoint=checkpoint_path, - num_classes=1001 -) ``` - -### Optimize with TensorRT - -```python -import tensorflow.contrib.tensorrt as trt - -trt_graph = trt.create_inference_graph( - input_graph_def=frozen_graph, - outputs=output_names, - max_batch_size=1, - max_workspace_size_bytes=1 << 25, - precision_mode='FP16', - minimum_segment_size=50 -) +$ python3 camera_tf_trt.py --usb --model ssd_mobilenet_v1_coco --build ``` - -### Jupyter Notebook Sample - -For a comprehensive example of performing the above steps and executing on a real -image, see the [jupyter notebook sample](examples/classification/classification.ipynb). - - -### Train for custom task - -Follow the documentation from the [TensorFlow models repository](https://github.com/tensorflow/models/tree/master/research/slim). -Once you have obtained a checkpoint, proceed with building the graph and optimizing -with TensorRT as shown above. 
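As noted above, the `--build` flow boils down to optimizing the frozen detection graph with TF-TRT and caching the result as a `.pb` file that later runs simply parse. A minimal sketch is shown below; the `trt.create_inference_graph()` arguments mirror the snippet from the original NVIDIA README, while the repository's actual `build_trt_pb()`/`load_trt_pb()` helpers (in `utils/od_utils.py`) may differ in detail:

```python
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt


def build_and_cache(frozen_graph, output_names, pb_path):
    """Optimize a frozen detection graph with TF-TRT and cache it to disk."""
    trt_graph = trt.create_inference_graph(
        input_graph_def=frozen_graph,
        outputs=output_names,
        max_batch_size=1,
        max_workspace_size_bytes=1 << 25,
        precision_mode='FP16',
        minimum_segment_size=50)
    with tf.gfile.GFile(pb_path, 'wb') as f:
        f.write(trt_graph.SerializeToString())


def load_cached(pb_path):
    """Later invocations skip optimization and just parse the cached .pb."""
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(pb_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
    return graph_def
```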
- - -Object Detection ----------------- - -detection - - -### Models - -| Model | Input Size | TF-TRT TX2 | TF TX2 | -|:------|:----------:|-----------:|-------:| -| ssd_mobilenet_v1_coco | 300x300 | 50.5ms | 72.9ms | -| ssd_inception_v2_coco | 300x300 | 54.4ms | 132ms | - -**TF** - Original TensorFlow graph (FP32) - -**TF-TRT** - TensorRT optimized graph (FP16) - -The above benchmark timings were gathered after placing the Jetson TX2 in MAX-N -mode. To do this, run the following commands in a terminal: +Example #2: verify the optimized 'ssd_mobilenet_v1_coco' model with NVIDIA's original 'huskies.jpg' picture. ``` -sudo nvpmodel -m 0 -sudo ~/jetson_clocks.sh +$ python3 camera_tf_trt.py --image --filename examples/detection/data/huskies.jpg --model ssd_mobilenet_v1_coco ``` - -### Download pretrained model - -As a convenience, we provide a script to download pretrained model weights and config files sourced from the -TensorFlow models repository. +Here is the result of example #2. -```python -from tf_trt_models.detection import download_detection_model +

+MobileNet V1 SSD detection result on huskies.jpg +
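The overlay in this picture is rendered from the raw detection outputs (boxes, confidences and class ids). A minimal sketch of how such boxes might be drawn with OpenCV is shown below, assuming normalized `[y1, x1, y2, x2]` boxes as produced by TensorFlow object detection models; the repository's `BBoxVisualization` class may choose colors and label formatting differently:

```python
import cv2


def draw_detections(img, boxes, confs, clss, cls_dict, conf_th=0.3):
    """Draw normalized [y1, x1, y2, x2] boxes onto a BGR image."""
    img_h, img_w = img.shape[:2]
    for bb, cf, cl in zip(boxes, confs, clss):
        if cf < conf_th:
            continue
        y1, x1 = int(bb[0] * img_h), int(bb[1] * img_w)
        y2, x2 = int(bb[2] * img_h), int(bb[3] * img_w)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label = '{} {:.2f}'.format(cls_dict.get(int(cl), str(int(cl))), cf)
        cv2.putText(img, label, (x1, max(y1 - 5, 0)), cv2.FONT_HERSHEY_PLAIN,
                    1.0, (0, 255, 0), 1, cv2.LINE_AA)
    return img
```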

-config_path, checkpoint_path = download_detection_model('ssd_inception_v2_coco') -``` -To manually download the pretrained models, follow the links [here](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md). + +Applying the Hand Detector Model +-------------------------------- -> **Important:** Some of the object detection configuration files have a very low non-maximum suppression score threshold (ie. 1e-8). -> This can cause unnecessarily large CPU post-processing load. Depending on your application, it may be advisable to raise -> this value to something larger (like 0.3) for improved performance. We do this for the above benchmark timings. This can be done by modifying the configuration -> file directly before calling build_detection_graph. The parameter can be found for example in this [line](https://github.com/tensorflow/models/blob/master/research/object_detection/samples/configs/ssd_mobilenet_v1_coco.config#L130). +Refer to the following blog posts for more details: - -### Build TensorRT / Jetson compatible graph +* [Training a Hand Detector with TensorFlow Object Detection API](https://jkjung-avt.github.io/hand-detection-tutorial/) +* [Deploying the Hand Detector onto Jetson TX2](https://jkjung-avt.github.io/hand-detection-on-tx2/) -```python -from tf_trt_models.detection import build_detection_graph +After you've trained your own hand detector with one of the following models, you'll be able to optimize the model with TF-TRT and run it on TX2. -frozen_graph, input_names, output_names = build_detection_graph( - config=config_path, - checkpoint=checkpoint_path -) ``` - - -### Optimize with TensorRT - -```python -import tensorflow.contrib.tensorrt as trt - -trt_graph = trt.create_inference_graph( - input_graph_def=frozen_graph, - outputs=output_names, - max_batch_size=1, - max_workspace_size_bytes=1 << 25, - precision_mode='FP16', - minimum_segment_size=50 -) +ssd_mobilenet_v1_egohands +ssd_mobilenet_v2_egohands +ssdlite_mobilenet_v2_egohands +ssd_inception_v2_egohands +faster_rcnn_resnet50_egohands +faster_rcnn_resnet101_egohands +faster_rcnn_inception_v2_egohands ``` - -### Jupyter Notebook Sample - -For a comprehensive example of performing the above steps and executing on a real -image, see the [jupyter notebook sample](examples/detection/detection.ipynb). +Be sure to copy your trained model checkpoint files into the corresponding `data/xxx_egohands/` folder. Say, you've done that for `ssd_mobilenet_v1_egohands`. Then you could optimize the model and test it with an image like this: - -### Train for custom task +```shell +$ python3 camera_tf_trt.py --image \ + --filename jk-son-hands.jpg \ + --model ssd_mobilenet_v1_egohands \ + --labelmap data/egohands_label_map.pbtxt \ + --num-classes 1 \ + --build +``` -Follow the documentation from the [TensorFlow models repository](https://github.com/tensorflow/models/tree/master/research/object_detection). -Once you have obtained a checkpoint, proceed with building the graph and optimizing -with TensorRT as shown above. Please note that all models are not tested so -you should use an object detection -config file during training that resembles one of the ssd_mobilenet_v1_coco or -ssd_inception_v2_coco models. Some config parameters may be modified, such as the number of -classes, image size, non-max supression parameters, but the performance may vary. +

+JK's son's hands +
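The `--labelmap` file passed above is a standard TensorFlow object detection label map (see `data/egohands_label_map.pbtxt` later in this diff, which maps id 1 to 'hand'). A minimal sketch of turning such a file into a `{class_id: name}` dictionary with the object detection API's own helper is shown below; the repository's `read_label_map()` utility may be implemented differently:

```python
from object_detection.utils import label_map_util


def load_cls_dict(labelmap_path, max_num_classes=90):
    """Build a {class_id: class_name} dict from a label map .pbtxt file."""
    label_map = label_map_util.load_labelmap(labelmap_path)
    categories = label_map_util.convert_label_map_to_categories(
        label_map, max_num_classes=max_num_classes, use_display_name=True)
    return {c['id']: c['name'] for c in categories}


print(load_cls_dict('data/egohands_label_map.pbtxt'))  # expected: {1: 'hand'}
```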

diff --git a/camera_tf_trt.py b/camera_tf_trt.py new file mode 100644 index 0000000..d844dd4 --- /dev/null +++ b/camera_tf_trt.py @@ -0,0 +1,199 @@ +"""camera_tf_trt.py + +This is a Camera TensorFlow/TensorRT Object Detection sample code for +Jetson TX2 or TX1. This script captures and displays video from either +a video file, an image file, an IP CAM, a USB webcam, or the Tegra +onboard camera, and does real-time object detection with example TensorRT +optimized SSD models in NVIDIA's 'tf_trt_models' repository. Refer to +README.md inside this repository for more information. + +This code is written and maintained by JK Jung. +""" + + +import sys +import time +import logging +import argparse + +import numpy as np +import cv2 +import tensorflow as tf +import tensorflow.contrib.tensorrt as trt + +from utils.camera import add_camera_args, Camera +from utils.od_utils import read_label_map, build_trt_pb, load_trt_pb, \ + write_graph_tensorboard, detect +from utils.visualization import BBoxVisualization + + +# Constants +DEFAULT_MODEL = 'ssd_inception_v2_coco' +DEFAULT_LABELMAP = 'third_party/models/research/object_detection/' \ + 'data/mscoco_label_map.pbtxt' +WINDOW_NAME = 'CameraTFTRTDemo' +BBOX_COLOR = (0, 255, 0) # green + + +def parse_args(): + """Parse input arguments.""" + desc = ('This script captures and displays live camera video, ' + 'and does real-time object detection with TF-TRT model ' + 'on Jetson TX2/TX1/Nano') + parser = argparse.ArgumentParser(description=desc) + parser = add_camera_args(parser) + parser.add_argument('--model', dest='model', + help='tf-trt object detection model ' + '[{}]'.format(DEFAULT_MODEL), + default=DEFAULT_MODEL, type=str) + parser.add_argument('--build', dest='do_build', + help='re-build TRT pb file (instead of using ' + 'the previously built version)', + action='store_true') + parser.add_argument('--tensorboard', dest='do_tensorboard', + help='write optimized graph summary to TensorBoard', + action='store_true') + parser.add_argument('--labelmap', dest='labelmap_file', + help='[{}]'.format(DEFAULT_LABELMAP), + default=DEFAULT_LABELMAP, type=str) + parser.add_argument('--num-classes', dest='num_classes', + help='(deprecated and not used) number of object ' + 'classes', type=int) + parser.add_argument('--confidence', dest='conf_th', + help='confidence threshold [0.3]', + default=0.3, type=float) + args = parser.parse_args() + return args + + +def open_display_window(width, height): + """Open the cv2 window for displaying images with bounding boxes.""" + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.resizeWindow(WINDOW_NAME, width, height) + cv2.moveWindow(WINDOW_NAME, 0, 0) + cv2.setWindowTitle(WINDOW_NAME, 'Camera TFTRT Object Detection Demo ' + 'for Jetson TX2/TX1') + + +def draw_help_and_fps(img, fps): + """Draw help message and fps number at top-left corner of the image.""" + help_text = "'Esc' to Quit, 'H' for FPS & Help, 'F' for Fullscreen" + font = cv2.FONT_HERSHEY_PLAIN + line = cv2.LINE_AA + + fps_text = 'FPS: {:.1f}'.format(fps) + cv2.putText(img, help_text, (11, 20), font, 1.0, (32, 32, 32), 4, line) + cv2.putText(img, help_text, (10, 20), font, 1.0, (240, 240, 240), 1, line) + cv2.putText(img, fps_text, (11, 50), font, 1.0, (32, 32, 32), 4, line) + cv2.putText(img, fps_text, (10, 50), font, 1.0, (240, 240, 240), 1, line) + return img + + +def set_full_screen(full_scrn): + """Set display window to full screen or not.""" + prop = cv2.WINDOW_FULLSCREEN if full_scrn else cv2.WINDOW_NORMAL + cv2.setWindowProperty(WINDOW_NAME, 
cv2.WND_PROP_FULLSCREEN, prop) + + +def loop_and_detect(cam, tf_sess, conf_th, vis, od_type): + """Loop, grab images from camera, and do object detection. + + # Arguments + cam: the camera object (video source). + tf_sess: TensorFlow/TensorRT session to run SSD object detection. + conf_th: confidence/score threshold for object detection. + vis: for visualization. + """ + show_fps = True + full_scrn = False + fps = 0.0 + tic = time.time() + while True: + if cv2.getWindowProperty(WINDOW_NAME, 0) < 0: + # Check to see if the user has closed the display window. + # If yes, terminate the while loop. + break + + img = cam.read() + if img is not None: + box, conf, cls = detect(img, tf_sess, conf_th, od_type=od_type) + img = vis.draw_bboxes(img, box, conf, cls) + if show_fps: + img = draw_help_and_fps(img, fps) + cv2.imshow(WINDOW_NAME, img) + toc = time.time() + curr_fps = 1.0 / (toc - tic) + # calculate an exponentially decaying average of fps number + fps = curr_fps if fps == 0.0 else (fps*0.9 + curr_fps*0.1) + tic = toc + + key = cv2.waitKey(1) + if key == 27: # ESC key: quit program + break + elif key == ord('H') or key == ord('h'): # Toggle help/fps + show_fps = not show_fps + elif key == ord('F') or key == ord('f'): # Toggle fullscreen + full_scrn = not full_scrn + set_full_screen(full_scrn) + + +def main(): + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) + # Ask tensorflow logger not to propagate logs to parent (which causes + # duplicated logging) + logging.getLogger('tensorflow').propagate = False + + args = parse_args() + logger.info('called with args: %s' % args) + + # build the class (index/name) dictionary from labelmap file + logger.info('reading label map') + cls_dict = read_label_map(args.labelmap_file) + + pb_path = './data/{}_trt.pb'.format(args.model) + log_path = './logs/{}_trt'.format(args.model) + if args.do_build: + logger.info('building TRT graph and saving to pb: %s' % pb_path) + build_trt_pb(args.model, pb_path) + + logger.info('opening camera device/file') + cam = Camera(args) + cam.open() + if not cam.is_opened: + sys.exit('Failed to open camera!') + + logger.info('loading TRT graph from pb: %s' % pb_path) + trt_graph = load_trt_pb(pb_path) + + logger.info('starting up TensorFlow session') + tf_config = tf.ConfigProto() + tf_config.gpu_options.allow_growth = True + tf_sess = tf.Session(config=tf_config, graph=trt_graph) + + if args.do_tensorboard: + logger.info('writing graph summary to TensorBoard') + write_graph_tensorboard(tf_sess, log_path) + + logger.info('warming up the TRT graph with a dummy image') + od_type = 'faster_rcnn' if 'faster_rcnn' in args.model else 'ssd' + dummy_img = np.zeros((720, 1280, 3), dtype=np.uint8) + _, _, _ = detect(dummy_img, tf_sess, conf_th=.3, od_type=od_type) + + cam.start() # ask the camera to start grabbing images + + # grab image and do object detection (until stopped by user) + logger.info('starting to loop and detect') + vis = BBoxVisualization(cls_dict) + open_display_window(cam.img_width, cam.img_height) + loop_and_detect(cam, tf_sess, args.conf_th, vis, od_type=od_type) + + logger.info('cleaning up') + cam.stop() # terminate the sub-thread in camera + tf_sess.close() + cam.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/data/egohands_label_map.pbtxt b/data/egohands_label_map.pbtxt new file mode 100644 index 0000000..6834348 --- /dev/null +++ b/data/egohands_label_map.pbtxt @@ -0,0 +1,4 @@ +item { + id: 1 + name: 'hand' +} diff --git 
a/data/faster_rcnn_inception_v2_egohands.config b/data/faster_rcnn_inception_v2_egohands.config new file mode 100644 index 0000000..b13141a --- /dev/null +++ b/data/faster_rcnn_inception_v2_egohands.config @@ -0,0 +1,146 @@ +# Faster R-CNN with Inception v2, configured for egohands dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + faster_rcnn { + num_classes: 1 + image_resizer { + #keep_aspect_ratio_resizer { + # min_dimension: 600 + # max_dimension: 1024 + #} + # Use fixed input image dimension, refer to: + # https://github.com/NVIDIA-Jetson/tf_trt_models/issues/6#issuecomment-423098067 + fixed_shape_resizer { + height: 576 + width: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_inception_v2' + first_stage_features_stride: 16 + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + first_stage_nms_score_threshold: 0.0 + first_stage_nms_iou_threshold: 0.7 + #first_stage_max_proposals: 300 + first_stage_max_proposals: 32 + first_stage_localization_loss_weight: 2.0 + first_stage_objectness_loss_weight: 1.0 + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + use_dropout: false + dropout_keep_probability: 1.0 + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + mode: FAN_AVG + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.0 + iou_threshold: 0.6 + #max_detections_per_class: 100 + max_detections_per_class: 32 + #max_total_detections: 300 + max_total_detections: 32 + } + score_converter: SOFTMAX + } + second_stage_batch_size: 32 + second_stage_localization_loss_weight: 2.0 + second_stage_classification_loss_weight: 1.0 + } +} + +train_config: { + batch_size: 1 + optimizer { + momentum_optimizer: { + learning_rate: { + manual_step_learning_rate { + initial_learning_rate: 0.0002 + schedule { + step: 30000 + learning_rate: .00002 + } + schedule { + step: 48000 + learning_rate: .000002 + } + } + } + momentum_optimizer_value: 0.9 + } + use_moving_average: false + } + gradient_clipping_by_norm: 10.0 + fine_tune_checkpoint: "faster_rcnn_inception_v2_coco_2018_01_28/model.ckpt" + from_detection_checkpoint: true + num_steps: 50000 + data_augmentation_options { + random_horizontal_flip { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" +} + +eval_config: { + num_examples: 500 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. 
+ max_evals: 10 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 +} diff --git a/data/faster_rcnn_inception_v2_egohands/README.md b/data/faster_rcnn_inception_v2_egohands/README.md new file mode 100644 index 0000000..426fc49 --- /dev/null +++ b/data/faster_rcnn_inception_v2_egohands/README.md @@ -0,0 +1,9 @@ +# faster_rcnn_inception_v2_egohands + +Copy your own trained 'faster_rcnn_inception_v2_egohands' model checkpoint files into this directory: + +``` +model.ckpt-50000.data-00000-of-00001 +model.ckpt-50000.index +model.ckpt-50000.meta +``` diff --git a/data/faster_rcnn_resnet101_egohands.config b/data/faster_rcnn_resnet101_egohands.config new file mode 100644 index 0000000..11d66f1 --- /dev/null +++ b/data/faster_rcnn_resnet101_egohands.config @@ -0,0 +1,146 @@ +# Faster R-CNN with Resnet-101 (v1), configuration for egohands dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + faster_rcnn { + num_classes: 1 + image_resizer { + #keep_aspect_ratio_resizer { + # min_dimension: 600 + # max_dimension: 1024 + #} + # Use fixed input image dimension, refer to: + # https://github.com/NVIDIA-Jetson/tf_trt_models/issues/6#issuecomment-423098067 + fixed_shape_resizer { + height: 576 + width: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_resnet101' + first_stage_features_stride: 16 + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + first_stage_nms_score_threshold: 0.0 + first_stage_nms_iou_threshold: 0.7 + #first_stage_max_proposals: 300 + first_stage_max_proposals: 32 + first_stage_localization_loss_weight: 2.0 + first_stage_objectness_loss_weight: 1.0 + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + use_dropout: false + dropout_keep_probability: 1.0 + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + mode: FAN_AVG + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.0 + iou_threshold: 0.6 + #max_detections_per_class: 100 + max_detections_per_class: 32 + #max_total_detections: 300 + max_total_detections: 32 + } + score_converter: SOFTMAX + } + second_stage_batch_size: 32 + second_stage_localization_loss_weight: 2.0 + second_stage_classification_loss_weight: 1.0 + } +} + +train_config: { + batch_size: 1 + optimizer { + momentum_optimizer: { + learning_rate: { + manual_step_learning_rate { + initial_learning_rate: 0.0003 + schedule { + step: 30000 + learning_rate: .00003 + } + schedule { + step: 48000 + learning_rate: .000003 + } + } + } + momentum_optimizer_value: 0.9 + } + use_moving_average: false + } + gradient_clipping_by_norm: 10.0 + fine_tune_checkpoint: "faster_rcnn_resnet101_coco_2018_01_28/model.ckpt" + from_detection_checkpoint: true + num_steps: 50000 + 
data_augmentation_options { + random_horizontal_flip { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" +} + +eval_config: { + num_examples: 500 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. + max_evals: 10 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 +} diff --git a/data/faster_rcnn_resnet101_egohands/README.md b/data/faster_rcnn_resnet101_egohands/README.md new file mode 100644 index 0000000..af9c466 --- /dev/null +++ b/data/faster_rcnn_resnet101_egohands/README.md @@ -0,0 +1,9 @@ +# faster_rcnn_resnet101_egohands + +Copy your own trained 'faster_rcnn_resnet101_egohands' model checkpoint files into this directory: + +``` +model.ckpt-50000.data-00000-of-00001 +model.ckpt-50000.index +model.ckpt-50000.meta +``` diff --git a/data/faster_rcnn_resnet50_egohands.config b/data/faster_rcnn_resnet50_egohands.config new file mode 100644 index 0000000..1dc050e --- /dev/null +++ b/data/faster_rcnn_resnet50_egohands.config @@ -0,0 +1,146 @@ +# Faster R-CNN with Resnet-50 (v1), configuration for egohands dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + faster_rcnn { + num_classes: 1 + image_resizer { + #keep_aspect_ratio_resizer { + # min_dimension: 600 + # max_dimension: 1024 + #} + # Use fixed input image dimension, refer to: + # https://github.com/NVIDIA-Jetson/tf_trt_models/issues/6#issuecomment-423098067 + fixed_shape_resizer { + height: 576 + width: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_resnet50' + first_stage_features_stride: 16 + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + first_stage_nms_score_threshold: 0.0 + first_stage_nms_iou_threshold: 0.7 + #first_stage_max_proposals: 300 + first_stage_max_proposals: 32 + first_stage_localization_loss_weight: 2.0 + first_stage_objectness_loss_weight: 1.0 + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + use_dropout: false + dropout_keep_probability: 1.0 + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + mode: FAN_AVG + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.0 + iou_threshold: 0.6 + #max_detections_per_class: 100 + max_detections_per_class: 32 + #max_total_detections: 300 + max_total_detections: 32 + } + score_converter: SOFTMAX + } + second_stage_batch_size: 32 + second_stage_localization_loss_weight: 2.0 + second_stage_classification_loss_weight: 1.0 + } +} + +train_config: { + batch_size: 1 + optimizer { + momentum_optimizer: { + learning_rate: { + manual_step_learning_rate { + initial_learning_rate: 
0.0003 + schedule { + step: 30000 + learning_rate: .00003 + } + schedule { + step: 48000 + learning_rate: .000003 + } + } + } + momentum_optimizer_value: 0.9 + } + use_moving_average: false + } + gradient_clipping_by_norm: 10.0 + fine_tune_checkpoint: "faster_rcnn_resnet50_coco_2018_01_28/model.ckpt" + from_detection_checkpoint: true + num_steps: 50000 + data_augmentation_options { + random_horizontal_flip { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" +} + +eval_config: { + num_examples: 500 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. + max_evals: 10 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 +} diff --git a/data/faster_rcnn_resnet50_egohands/README.md b/data/faster_rcnn_resnet50_egohands/README.md new file mode 100644 index 0000000..d17a88a --- /dev/null +++ b/data/faster_rcnn_resnet50_egohands/README.md @@ -0,0 +1,9 @@ +# faster_rcnn_resnet50_egohands + +Copy your own trained 'faster_rcnn_resnet50_egohands' model checkpoint files into this directory: + +``` +model.ckpt-50000.data-00000-of-00001 +model.ckpt-50000.index +model.ckpt-50000.meta +``` diff --git a/data/huskies_detected.png b/data/huskies_detected.png new file mode 100644 index 0000000..85382ae Binary files /dev/null and b/data/huskies_detected.png differ diff --git a/data/rfcn_resnet101_egohands.config b/data/rfcn_resnet101_egohands.config new file mode 100644 index 0000000..822cdc6 --- /dev/null +++ b/data/rfcn_resnet101_egohands.config @@ -0,0 +1,143 @@ +# R-FCN with Resnet-101 (v1), configuration for egohands dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. 
+ +model { + faster_rcnn { + num_classes: 1 + image_resizer { + #keep_aspect_ratio_resizer { + # min_dimension: 600 + # max_dimension: 1024 + #} + # Use fixed input image dimension, refer to: + # https://github.com/NVIDIA-Jetson/tf_trt_models/issues/6#issuecomment-423098067 + fixed_shape_resizer { + height: 576 + width: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_resnet101' + first_stage_features_stride: 16 + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + first_stage_nms_score_threshold: 0.0 + first_stage_nms_iou_threshold: 0.7 + #first_stage_max_proposals: 300 + first_stage_max_proposals: 32 + first_stage_localization_loss_weight: 2.0 + first_stage_objectness_loss_weight: 1.0 + second_stage_box_predictor { + rfcn_box_predictor { + conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + crop_height: 18 + crop_width: 18 + num_spatial_bins_height: 3 + num_spatial_bins_width: 3 + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.0 + iou_threshold: 0.6 + #max_detections_per_class: 100 + max_detections_per_class: 32 + #max_total_detections: 300 + max_total_detections: 32 + } + score_converter: SOFTMAX + } + second_stage_batch_size: 32 + second_stage_localization_loss_weight: 2.0 + second_stage_classification_loss_weight: 1.0 + } +} + +train_config: { + batch_size: 1 + optimizer { + momentum_optimizer: { + learning_rate: { + manual_step_learning_rate { + initial_learning_rate: 0.0003 + schedule { + step: 30000 + learning_rate: .00003 + } + schedule { + step: 48000 + learning_rate: .000003 + } + } + } + momentum_optimizer_value: 0.9 + } + use_moving_average: false + } + gradient_clipping_by_norm: 10.0 + fine_tune_checkpoint: "rfcn_resnet101_coco_2018_01_28/model.ckpt" + from_detection_checkpoint: true + num_steps: 50000 + data_augmentation_options { + random_horizontal_flip { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" +} + +eval_config: { + num_examples: 500 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. + max_evals: 10 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 +} diff --git a/data/rfcn_resnet101_egohands/README.md b/data/rfcn_resnet101_egohands/README.md new file mode 100644 index 0000000..e37681d --- /dev/null +++ b/data/rfcn_resnet101_egohands/README.md @@ -0,0 +1,9 @@ +# rfcn_resnet101_egohands + +Copy your own trained 'rfcn_resnet101_egohands' model checkpoint files into this directory: + +``` +model.ckpt-50000.data-00000-of-00001 +model.ckpt-50000.index +model.ckpt-50000.meta +``` diff --git a/data/ssd_inception_v2_egohands.config b/data/ssd_inception_v2_egohands.config new file mode 100644 index 0000000..8d1fedc --- /dev/null +++ b/data/ssd_inception_v2_egohands.config @@ -0,0 +1,189 @@ +# SSD with Inception v2, configured for egohands dataset. 
+# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + ssd { + num_classes: 1 + box_coder { + faster_rcnn_box_coder { + y_scale: 10.0 + x_scale: 10.0 + height_scale: 5.0 + width_scale: 5.0 + } + } + matcher { + argmax_matcher { + matched_threshold: 0.5 + unmatched_threshold: 0.5 + ignore_thresholds: false + negatives_lower_than_unmatched: true + force_match_for_each_row: true + } + } + similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + num_layers: 6 + min_scale: 0.1 # Use a smaller min_scale so that our trained model might be able to detect smaller objects (hands) more accurately + max_scale: 0.9 + aspect_ratios: 1.0 + aspect_ratios: 2.0 + aspect_ratios: 0.5 + aspect_ratios: 3.0 + aspect_ratios: 0.3333 + reduce_boxes_in_lowest_layer: true + } + } + image_resizer { + fixed_shape_resizer { + height: 300 + width: 300 + } + } + box_predictor { + convolutional_box_predictor { + min_depth: 0 + max_depth: 0 + num_layers_before_predictor: 0 + use_dropout: false + dropout_keep_probability: 0.8 + kernel_size: 3 + box_code_size: 4 + apply_sigmoid_to_scores: false + conv_hyperparams { + activation: RELU_6, + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.03 + mean: 0.0 + } + } + } + } + } + feature_extractor { + type: 'ssd_inception_v2' + min_depth: 16 + depth_multiplier: 1.0 + conv_hyperparams { + activation: RELU_6, + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.03 + mean: 0.0 + } + } + batch_norm { + train: true, + scale: true, + center: true, + decay: 0.9997, + epsilon: 0.001, + } + } + override_base_feature_extractor_hyperparams: true + } + loss { + classification_loss { + weighted_sigmoid { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + hard_example_miner { + num_hard_examples: 3000 + iou_threshold: 0.99 + loss_type: CLASSIFICATION + max_negatives_per_positive: 3 + min_negatives_per_image: 0 + } + classification_weight: 1.0 + localization_weight: 1.0 + } + normalize_loss_by_num_matches: true + post_processing { + batch_non_max_suppression { + score_threshold: 1e-8 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 100 + } + score_converter: SIGMOID + } + } +} + +train_config: { + batch_size: 24 + optimizer { + rms_prop_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.004 + decay_steps: 1000 + decay_factor: 0.8 + } + } + momentum_optimizer_value: 0.9 + decay: 0.9 + epsilon: 1.0 + } + } + fine_tune_checkpoint: "ssd_inception_v2_coco_2018_01_28/model.ckpt" + from_detection_checkpoint: true + # Note: The below line limits the training process to 200K steps, which we + # empirically found to be sufficient enough to train the pets dataset. This + # effectively bypasses the learning rate schedule (the learning rate will + # never decay). Remove the below line to train indefinitely. 
+ num_steps: 20000 + data_augmentation_options { + random_horizontal_flip { + } + } + data_augmentation_options { + ssd_random_crop { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" +} + +eval_config: { + num_examples: 500 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. + max_evals: 10 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 +} diff --git a/data/ssd_inception_v2_egohands/README.md b/data/ssd_inception_v2_egohands/README.md new file mode 100644 index 0000000..87c6f07 --- /dev/null +++ b/data/ssd_inception_v2_egohands/README.md @@ -0,0 +1,9 @@ +# ssd_inception_v2_egohands + +Copy your own trained 'ssd_inception_v2_egohands' model checkpoint files into this directory: + +``` +model.ckpt-20000.data-00000-of-00001 +model.ckpt-20000.index +model.ckpt-20000.meta +``` diff --git a/data/ssd_mobilenet_v1_egohands.config b/data/ssd_mobilenet_v1_egohands.config new file mode 100644 index 0000000..5d08164 --- /dev/null +++ b/data/ssd_mobilenet_v1_egohands.config @@ -0,0 +1,194 @@ +# SSD with Mobilenet v1, configured for egohands dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + ssd { + num_classes: 1 + box_coder { + faster_rcnn_box_coder { + y_scale: 10.0 + x_scale: 10.0 + height_scale: 5.0 + width_scale: 5.0 + } + } + matcher { + argmax_matcher { + matched_threshold: 0.5 + unmatched_threshold: 0.5 + ignore_thresholds: false + negatives_lower_than_unmatched: true + force_match_for_each_row: true + } + } + similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + num_layers: 6 + min_scale: 0.1 # Use a smaller min_scale so that our trained model might be able to detect smaller objects (hands) more accurately + max_scale: 0.9 + aspect_ratios: 1.0 + aspect_ratios: 2.0 + aspect_ratios: 0.5 + aspect_ratios: 3.0 + aspect_ratios: 0.3333 + } + } + image_resizer { + fixed_shape_resizer { + height: 300 + width: 300 + } + } + box_predictor { + convolutional_box_predictor { + min_depth: 0 + max_depth: 0 + num_layers_before_predictor: 0 + use_dropout: false + dropout_keep_probability: 0.8 + kernel_size: 1 + box_code_size: 4 + apply_sigmoid_to_scores: false + conv_hyperparams { + activation: RELU_6, + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.03 + mean: 0.0 + } + } + batch_norm { + train: true, + scale: true, + center: true, + decay: 0.9997, + epsilon: 0.001, + } + } + } + } + feature_extractor { + type: 'ssd_mobilenet_v1' + min_depth: 16 + depth_multiplier: 1.0 + conv_hyperparams { + activation: RELU_6, + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.03 + mean: 0.0 + } + } + batch_norm { + train: true, + scale: true, + center: true, + decay: 0.9997, + epsilon: 0.001, + } + } + } + loss { + classification_loss { + weighted_sigmoid { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + hard_example_miner { + num_hard_examples: 3000 + iou_threshold: 0.99 
+ loss_type: CLASSIFICATION + max_negatives_per_positive: 3 + min_negatives_per_image: 0 + } + classification_weight: 1.0 + localization_weight: 1.0 + } + normalize_loss_by_num_matches: true + post_processing { + batch_non_max_suppression { + #score_threshold: 1e-8 + score_threshold: 0.3 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 100 + } + score_converter: SIGMOID + } + } +} + +train_config: { + batch_size: 24 + optimizer { + rms_prop_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.004 + decay_steps: 1000 + decay_factor: 0.8 + } + } + momentum_optimizer_value: 0.9 + decay: 0.9 + epsilon: 1.0 + } + } + fine_tune_checkpoint: "ssd_mobilenet_v1_coco_2018_01_28/model.ckpt" + from_detection_checkpoint: true + load_all_detection_checkpoint_vars: true + # Note: The below line limits the training process to 200K steps, which we + # empirically found to be sufficient enough to train the pets dataset. This + # effectively bypasses the learning rate schedule (the learning rate will + # never decay). Remove the below line to train indefinitely. + num_steps: 20000 + data_augmentation_options { + random_horizontal_flip { + } + } + data_augmentation_options { + ssd_random_crop { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" +} + +eval_config: { + metrics_set: "coco_detection_metrics" + num_examples: 500 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 +} diff --git a/data/ssd_mobilenet_v1_egohands/README.md b/data/ssd_mobilenet_v1_egohands/README.md new file mode 100644 index 0000000..7fd83d8 --- /dev/null +++ b/data/ssd_mobilenet_v1_egohands/README.md @@ -0,0 +1,9 @@ +# ssd_mobilenet_v1_egohands + +Copy your own trained 'ssd_mobilenet_v1_egohands' model checkpoint files into this directory: + +``` +model.ckpt-20000.data-00000-of-00001 +model.ckpt-20000.index +model.ckpt-20000.meta +``` diff --git a/data/ssd_mobilenet_v2_egohands.config b/data/ssd_mobilenet_v2_egohands.config new file mode 100644 index 0000000..15897eb --- /dev/null +++ b/data/ssd_mobilenet_v2_egohands.config @@ -0,0 +1,185 @@ +# SSD with Mobilenet v2, configured for egohands dataset. 
+# This file was extracted modified from 'pipeline.config' in +# http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz + +model { + ssd { + num_classes: 1 + image_resizer { + fixed_shape_resizer { + height: 300 + width: 300 + } + } + feature_extractor { + type: "ssd_mobilenet_v2" + depth_multiplier: 1.0 + min_depth: 16 + conv_hyperparams { + regularizer { + l2_regularizer { + weight: 4e-05 + } + } + initializer { + truncated_normal_initializer { + mean: 0.0 + stddev: 0.03 + } + } + activation: RELU_6 + batch_norm { + decay: 0.9997 + center: true + scale: true + epsilon: 0.001 + train: true + } + } + #batch_norm_trainable: true + use_depthwise: true + } + box_coder { + faster_rcnn_box_coder { + y_scale: 10.0 + x_scale: 10.0 + height_scale: 5.0 + width_scale: 5.0 + } + } + matcher { + argmax_matcher { + matched_threshold: 0.5 + unmatched_threshold: 0.5 + ignore_thresholds: false + negatives_lower_than_unmatched: true + force_match_for_each_row: true + } + } + similarity_calculator { + iou_similarity { + } + } + box_predictor { + convolutional_box_predictor { + conv_hyperparams { + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + mean: 0.0 + stddev: 0.03 + } + } + activation: RELU_6 + batch_norm { + decay: 0.9997 + center: true + scale: true + epsilon: 0.001 + train: true + } + } + min_depth: 0 + max_depth: 0 + num_layers_before_predictor: 0 + use_dropout: false + dropout_keep_probability: 0.8 + kernel_size: 3 + box_code_size: 4 + apply_sigmoid_to_scores: false + } + } + anchor_generator { + ssd_anchor_generator { + num_layers: 6 + min_scale: 0.05 + max_scale: 0.95 + aspect_ratios: 1.0 + aspect_ratios: 2.0 + aspect_ratios: 0.5 + aspect_ratios: 3.0 + aspect_ratios: 0.33 + } + } + post_processing { + batch_non_max_suppression { + score_threshold: 1e-8 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 100 + } + score_converter: SIGMOID + } + normalize_loss_by_num_matches: true + loss { + localization_loss { + weighted_smooth_l1 { + } + } + classification_loss { + weighted_sigmoid { + } + } + hard_example_miner { + num_hard_examples: 3000 + iou_threshold: 0.99 + loss_type: CLASSIFICATION + max_negatives_per_positive: 3 + min_negatives_per_image: 3 + } + classification_weight: 1.0 + localization_weight: 1.0 + } + } +} +train_config { + batch_size: 24 + data_augmentation_options { + random_horizontal_flip { + } + } + data_augmentation_options { + ssd_random_crop { + } + } + optimizer { + rms_prop_optimizer { + learning_rate { + exponential_decay_learning_rate { + initial_learning_rate: 0.004 + decay_steps: 1000 + decay_factor: 0.8 + } + } + momentum_optimizer_value: 0.9 + decay: 0.9 + epsilon: 1.0 + } + } + fine_tune_checkpoint: "ssd_mobilenet_v2_coco_2018_03_29/model.ckpt" + num_steps: 20000 + fine_tune_checkpoint_type: "detection" +} +train_input_reader { + label_map_path: "data/egohands_label_map.pbtxt" + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } +} +eval_config { + num_examples: 500 + max_evals: 10 + use_moving_averages: false +} +eval_input_reader { + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } +} diff --git a/data/ssd_mobilenet_v2_egohands/README.md b/data/ssd_mobilenet_v2_egohands/README.md new file mode 100644 index 0000000..2e833b9 --- /dev/null +++ b/data/ssd_mobilenet_v2_egohands/README.md @@ -0,0 +1,9 @@ +# 
ssd_mobilenet_v2_egohands + +Copy your own trained 'ssd_mobilenet_v2_egohands' model checkpoint files into this directory: + +``` +model.ckpt-20000.data-00000-of-00001 +model.ckpt-20000.index +model.ckpt-20000.meta +``` diff --git a/data/ssdlite_mobilenet_v2_egohands.config b/data/ssdlite_mobilenet_v2_egohands.config new file mode 100644 index 0000000..f1fdb6e --- /dev/null +++ b/data/ssdlite_mobilenet_v2_egohands.config @@ -0,0 +1,197 @@ +# SSDLite with Mobilenet v2 configuration for egohands dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + ssd { + num_classes: 1 + box_coder { + faster_rcnn_box_coder { + y_scale: 10.0 + x_scale: 10.0 + height_scale: 5.0 + width_scale: 5.0 + } + } + matcher { + argmax_matcher { + matched_threshold: 0.5 + unmatched_threshold: 0.5 + ignore_thresholds: false + negatives_lower_than_unmatched: true + force_match_for_each_row: true + } + } + similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + num_layers: 6 + min_scale: 0.05 + max_scale: 0.95 + aspect_ratios: 1.0 + aspect_ratios: 2.0 + aspect_ratios: 0.5 + aspect_ratios: 3.0 + aspect_ratios: 0.3333 + } + } + image_resizer { + fixed_shape_resizer { + height: 300 + width: 300 + } + } + box_predictor { + convolutional_box_predictor { + min_depth: 0 + max_depth: 0 + num_layers_before_predictor: 0 + use_dropout: false + dropout_keep_probability: 0.8 + kernel_size: 3 + use_depthwise: true + box_code_size: 4 + apply_sigmoid_to_scores: false + conv_hyperparams { + activation: RELU_6, + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.03 + mean: 0.0 + } + } + batch_norm { + train: true, + scale: true, + center: true, + decay: 0.9997, + epsilon: 0.001, + } + } + } + } + feature_extractor { + type: 'ssd_mobilenet_v2' + min_depth: 16 + depth_multiplier: 1.0 + use_depthwise: true + conv_hyperparams { + activation: RELU_6, + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.03 + mean: 0.0 + } + } + batch_norm { + train: true, + scale: true, + center: true, + decay: 0.9997, + epsilon: 0.001, + } + } + } + loss { + classification_loss { + weighted_sigmoid { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + hard_example_miner { + num_hard_examples: 3000 + iou_threshold: 0.99 + loss_type: CLASSIFICATION + max_negatives_per_positive: 3 + min_negatives_per_image: 3 + } + classification_weight: 1.0 + localization_weight: 1.0 + } + normalize_loss_by_num_matches: true + post_processing { + batch_non_max_suppression { + #score_threshold: 1e-8 + score_threshold: 0.3 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 100 + } + score_converter: SIGMOID + } + } +} + +train_config: { + batch_size: 24 + optimizer { + rms_prop_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.004 + decay_steps: 1000 + decay_factor: 0.8 + } + } + momentum_optimizer_value: 0.9 + decay: 0.9 + epsilon: 1.0 + } + } + fine_tune_checkpoint: "ssdlite_mobilenet_v2_coco_2018_05_09/model.ckpt" + fine_tune_checkpoint_type: "detection" + # Note: The below line limits the training process to 200K steps, which we + # empirically found to be sufficient enough to train the 
pets dataset. This + # effectively bypasses the learning rate schedule (the learning rate will + # never decay). Remove the below line to train indefinitely. + num_steps: 20000 + data_augmentation_options { + random_horizontal_flip { + } + } + data_augmentation_options { + ssd_random_crop { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_train.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" +} + +eval_config: { + num_examples: 500 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. + max_evals: 10 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "data/egohands_val.tfrecord" + } + label_map_path: "data/egohands_label_map.pbtxt" + shuffle: false + num_readers: 1 +} diff --git a/data/ssdlite_mobilenet_v2_egohands/README.md b/data/ssdlite_mobilenet_v2_egohands/README.md new file mode 100644 index 0000000..1562726 --- /dev/null +++ b/data/ssdlite_mobilenet_v2_egohands/README.md @@ -0,0 +1,9 @@ +# ssdlite_mobilenet_v2_egohands + +Copy your own trained 'ssdlite_mobilenet_v2_egohands' model checkpoint files into this directory: + +``` +model.ckpt-20000.data-00000-of-00001 +model.ckpt-20000.index +model.ckpt-20000.meta +``` diff --git a/install.sh b/install.sh index c569148..c341527 100755 --- a/install.sh +++ b/install.sh @@ -2,14 +2,9 @@ INSTALL_PROTOC=$PWD/scripts/install_protoc.sh MODELS_DIR=$PWD/third_party/models +PYTHON=python3 -PYTHON=python - -if [ $# -eq 1 ]; then - PYTHON=$1 -fi - -echo $PYTHON +echo "Using $PYTHON" # install protoc echo "Downloading protoc" @@ -21,6 +16,31 @@ git submodule update --init pushd $MODELS_DIR/research echo $PWD + +sed -i "842s/print 'Scores and tpfp per class label: {}'.format(class_index)/print('Scores and tpfp per class label: {}'.format(class_index))/" \ + object_detection/utils/object_detection_evaluation.py +sed -i '843s/print tp_fp_labels/print(tp_fp_labels)/' \ + object_detection/utils/object_detection_evaluation.py +sed -i '844s/print scores/print(scores)/' \ + object_detection/utils/object_detection_evaluation.py +sed -i "157s/print '--annotation_type expected value is 1 or 2.'/print('--annotation_type expected value is 1 or 2.')/" \ + object_detection/dataset_tools/oid_hierarchical_labels_expansion.py +sed -i '516s/print num_classes, num_anchors/print(num_classes, num_anchors)/' \ + object_detection/meta_architectures/ssd_meta_arch_test.py +sed -i '147s/print /print(/' \ + object_detection/dataset_tools/oid_hierarchical_labels_expansion.py +sed -i '149s/labels_file"""$/[optional]labels_file""")/' \ + object_detection/dataset_tools/oid_hierarchical_labels_expansion.py +sed -i '281s/loss_tensor in losses_dict.itervalues()/_, loss_tensor in losses_dict.items()/' \ + object_detection/model_lib.py +sed -i '380s/category_index.values(),/list(category_index.values()),/' \ + object_detection/model_lib.py +sed -i '390s/iteritems()/items()/' \ + object_detection/model_lib.py +sed -i '168s/range(num_boundaries),/list(range(num_boundaries)),/' \ + object_detection/utils/learning_schedules.py +sed -i '225s/reversed(zip(output_feature_map_keys, output_feature_maps_list)))/reversed(list(zip(output_feature_map_keys, output_feature_maps_list))))/' \ + object_detection/models/feature_map_generators.py echo "Installing object detection library" echo $PROTOC $PROTOC object_detection/protos/*.proto --python_out=. 
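After `./install.sh` finishes, a quick sanity check like the one below can confirm that the protoc-generated `object_detection` protos and the TF-TRT module are importable. This is only an assumption-level snippet (run with `python3`, assuming the install step put `object_detection` on the Python path); it is not part of the install script itself:

```python
# Sanity check after running ./install.sh
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt              # TF-TRT support
from object_detection.protos import pipeline_pb2       # protoc-generated protos

print('TensorFlow', tf.__version__,
      '- object_detection protos and TF-TRT are importable')
```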
diff --git a/scripts/install_protoc.sh b/scripts/install_protoc.sh index 560e06d..e0ee023 100644 --- a/scripts/install_protoc.sh +++ b/scripts/install_protoc.sh @@ -3,17 +3,19 @@ BASE_URL="https://github.com/google/protobuf/releases/download/v3.5.1/" PROTOC_DIR=data/protoc -mkdir -p $PROTOC_DIR -pushd $PROTOC_DIR -ARCH=$(uname -m) -if [ "$ARCH" == "aarch64" ] ; then - filename="protoc-3.5.1-linux-aarch_64.zip" -elif [ "$ARCH" == "x86_64" ] ; then - filename="protoc-3.5.1-linux-x86_64.zip" -else - echo ERROR: $ARCH not supported. - exit 1; +if [ ! -r $PROTOC_DIR/bin/protoc ]; then + mkdir -p $PROTOC_DIR + pushd $PROTOC_DIR + ARCH=$(uname -m) + if [ "$ARCH" == "aarch64" ] ; then + filename="protoc-3.5.1-linux-aarch_64.zip" + elif [ "$ARCH" == "x86_64" ] ; then + filename="protoc-3.5.1-linux-x86_64.zip" + else + echo ERROR: $ARCH not supported. + exit 1; + fi + wget --no-check-certificate ${BASE_URL}${filename} + unzip ${filename} + popd fi -wget --no-check-certificate ${BASE_URL}${filename} -unzip ${filename} -popd diff --git a/tf_trt_models/detection.py b/tf_trt_models/detection.py index f1341fc..eff100d 100644 --- a/tf_trt_models/detection.py +++ b/tf_trt_models/detection.py @@ -1,21 +1,22 @@ -from object_detection.protos import pipeline_pb2 -from object_detection.protos import image_resizer_pb2 -from object_detection import exporter - import os import subprocess - from collections import namedtuple from google.protobuf import text_format import tensorflow as tf +from object_detection.protos import pipeline_pb2 +from object_detection.protos import image_resizer_pb2 +from object_detection import exporter from .graph_utils import force_nms_cpu as f_force_nms_cpu +from .graph_utils import force_2ndstage_cpu as f_force_2ndstage_cpu from .graph_utils import replace_relu6 as f_replace_relu6 from .graph_utils import remove_assert as f_remove_assert + DetectionModel = namedtuple('DetectionModel', ['name', 'url', 'extract_dir']) + INPUT_NAME='image_tensor' BOXES_NAME='detection_boxes' CLASSES_NAME='detection_classes' @@ -26,7 +27,6 @@ PIPELINE_CONFIG_NAME='pipeline.config' CHECKPOINT_PREFIX='model.ckpt' - MODELS = { 'ssd_mobilenet_v1_coco': DetectionModel( 'ssd_mobilenet_v1_coco', @@ -71,7 +71,12 @@ def get_input_names(model): def get_output_names(model): - output_names = [BOXES_NAME, CLASSES_NAME, SCORES_NAME, NUM_DETECTIONS_NAME] + output_names = [ + BOXES_NAME, + CLASSES_NAME, + SCORES_NAME, + NUM_DETECTIONS_NAME + ] if model == 'mask_rcnn_resnet50_atrous_coco': output_names.append(MASKS_NAME) return output_names @@ -82,21 +87,30 @@ def download_detection_model(model, output_dir='.'): global MODELS model_name = model - - model = MODELS[model_name] - subprocess.call(['mkdir', '-p', output_dir]) - tar_file = os.path.join(output_dir, os.path.basename(model.url)) - - config_path = os.path.join(output_dir, model.extract_dir, PIPELINE_CONFIG_NAME) - checkpoint_path = os.path.join(output_dir, model.extract_dir, CHECKPOINT_PREFIX) - - if not os.path.exists(os.path.join(output_dir, model.extract_dir)): - subprocess.call(['wget', model.url, '-O', tar_file]) - subprocess.call(['tar', '-xzf', tar_file, '-C', output_dir]) - - # hack fix to handle mobilenet_v2 config bug - subprocess.call(['sed', '-i', '/batch_norm_trainable/d', config_path]) - + if model_name in MODELS.keys(): + model = MODELS[model_name] + subprocess.call(['mkdir', '-p', output_dir]) + tar_file = os.path.join(output_dir, os.path.basename(model.url)) + extract_dir = model.extract_dir + config_path = os.path.join(output_dir, 
model.extract_dir, + PIPELINE_CONFIG_NAME) + checkpoint_path = os.path.join(output_dir, model.extract_dir, + CHECKPOINT_PREFIX) + if not os.path.exists(os.path.join(output_dir, model.extract_dir)): + subprocess.call(['wget', model.url, '-O', tar_file]) + subprocess.call(['tar', '-xzf', tar_file, '-C', output_dir]) + + # hack fix to handle mobilenet_v2 config bug + subprocess.call(['sed', '-i', '/batch_norm_trainable/d', config_path]) + else: + # assuming user is querying a self-trained 'egohands' model + if not os.path.exists(os.path.join(output_dir, model_name + '.config')): + raise FileNotFoundError + if not os.path.exists(os.path.join(output_dir, model_name)): + raise FileNotFoundError + config_path = os.path.join(output_dir, model_name + '.config') + checkpoint_path = os.path.join(output_dir, model_name, + CHECKPOINT_PREFIX) return config_path, checkpoint_path @@ -104,12 +118,13 @@ def build_detection_graph(config, checkpoint, batch_size=1, score_threshold=None, force_nms_cpu=True, + force_frcn2_cpu=True, replace_relu6=True, remove_assert=True, input_shape=None, output_dir='.generated_model'): """Builds a frozen graph for a pre-trained object detection model""" - + config_path = config checkpoint_path = checkpoint @@ -122,7 +137,7 @@ def build_detection_graph(config, checkpoint, if config.model.HasField('ssd'): config.model.ssd.feature_extractor.override_base_feature_extractor_hyperparams = True if score_threshold is not None: - config.model.ssd.post_processing.batch_non_max_suppression.score_threshold = score_threshold + config.model.ssd.post_processing.batch_non_max_suppression.score_threshold = score_threshold if input_shape is not None: config.model.ssd.image_resizer.fixed_shape_resizer.height = input_shape[0] config.model.ssd.image_resizer.fixed_shape_resizer.width = input_shape[1] @@ -143,10 +158,10 @@ def build_detection_graph(config, checkpoint, with tf.Session(config=tf_config) as tf_sess: with tf.Graph().as_default() as tf_graph: exporter.export_inference_graph( - 'image_tensor', - config, - checkpoint_path, - output_dir, + 'image_tensor', + config, + checkpoint_path, + output_dir, input_shape=[batch_size, None, None, 3] ) @@ -158,13 +173,17 @@ def build_detection_graph(config, checkpoint, # apply graph modifications if force_nms_cpu: frozen_graph = f_force_nms_cpu(frozen_graph) + if force_frcn2_cpu: + if 'faster_rcnn_' in config_path or 'rfcn_' in config_path: + frozen_graph = f_force_2ndstage_cpu(frozen_graph) if replace_relu6: frozen_graph = f_replace_relu6(frozen_graph) if remove_assert: - frozen_graph = f_remove_assert(frozen_graph) + if 'ssd_' in config_path or 'ssdlite_' in config_path: + frozen_graph = f_remove_assert(frozen_graph) # get input names - # TODO: handle mask_rcnn + # TODO: handle mask_rcnn input_names = [INPUT_NAME] output_names = [BOXES_NAME, CLASSES_NAME, SCORES_NAME, NUM_DETECTIONS_NAME] diff --git a/tf_trt_models/graph_utils.py b/tf_trt_models/graph_utils.py index 71cb198..e662b15 100644 --- a/tf_trt_models/graph_utils.py +++ b/tf_trt_models/graph_utils.py @@ -20,7 +20,7 @@ def make_relu6(output_name, input_name, const6_name='const6'): #tf_y = tf.nn.relu(tf.subtract(tf_6, tf.nn.relu(tf_x, name='relu1'), name='sub'), name='relu2') #tf_y = tf.subtract(tf_6, tf_y, name=output_name) tf_y = tf.subtract(tf_y1, tf_y2, name=output_name) - + graph_def = graph.as_graph_def() graph_def.node[-1].name = output_name @@ -34,7 +34,7 @@ def make_relu6(output_name, input_name, const6_name='const6'): for node in graph_def.node: if node.op == '_Neg': node.op = 'Neg' - + 
return graph_def @@ -47,7 +47,7 @@ def convert_relu6(graph_def, const6_name='const6'): if not has_const6: const6_graph_def = make_const6(const6_name=const6_name) graph_def.node.extend(const6_graph_def.node) - + for node in graph_def.node: if node.op == 'Relu6': input_name = node.input[0] @@ -55,7 +55,7 @@ def convert_relu6(graph_def, const6_name='const6'): relu6_graph_def = make_relu6(output_name, input_name, const6_name=const6_name) graph_def.node.remove(node) graph_def.node.extend(relu6_graph_def.node) - + return graph_def @@ -82,6 +82,13 @@ def force_nms_cpu(frozen_graph): return frozen_graph +def force_2ndstage_cpu(frozen_graph): + for node in frozen_graph.node: + if 'SecondStage' in node.name: + node.device = '/device:CPU:0' + return frozen_graph + + def replace_relu6(frozen_graph): return convert_relu6(frozen_graph) diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/camera.py b/utils/camera.py new file mode 100644 index 0000000..e7424a3 --- /dev/null +++ b/utils/camera.py @@ -0,0 +1,215 @@ +"""camera.py + +This code implements the Camera class, which encapsulates code to +handle IP CAM, USB webcam or the Jetson onboard camera. In +addition, this Camera class is further extended to take a video +file or an image file as input. +""" + + +import logging +import threading +import subprocess + +import numpy as np +import cv2 + + +def add_camera_args(parser): + """Add parser augument for camera options.""" + parser.add_argument('--file', dest='use_file', + help='use a video file as input (remember to ' + 'also set --filename)', + action='store_true') + parser.add_argument('--image', dest='use_image', + help='use an image file as input (remember to ' + 'also set --filename)', + action='store_true') + parser.add_argument('--filename', dest='filename', + help='video file name, e.g. test.mp4', + default=None, type=str) + parser.add_argument('--rtsp', dest='use_rtsp', + help='use IP CAM (remember to also set --uri)', + action='store_true') + parser.add_argument('--uri', dest='rtsp_uri', + help='RTSP URI, e.g. rtsp://192.168.1.64:554', + default=None, type=str) + parser.add_argument('--latency', dest='rtsp_latency', + help='latency in ms for RTSP [200]', + default=200, type=int) + parser.add_argument('--usb', dest='use_usb', + help='use USB webcam (remember to also set --vid)', + action='store_true') + parser.add_argument('--vid', dest='video_dev', + help='device # of USB webcam (/dev/video?) [0]', + default=0, type=int) + parser.add_argument('--width', dest='image_width', + help='image width [640]', + default=640, type=int) + parser.add_argument('--height', dest='image_height', + help='image height [480]', + default=480, type=int) + return parser + + +def open_cam_rtsp(uri, width, height, latency): + """Open an RTSP URI (IP CAM).""" + gst_str = ('rtspsrc location={} latency={} ! ' + 'rtph264depay ! h264parse ! omxh264dec ! ' + 'nvvidconv ! ' + 'video/x-raw, width=(int){}, height=(int){}, ' + 'format=(string)BGRx ! videoconvert ! ' + 'appsink').format(uri, latency, width, height) + return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER) + + +def open_cam_usb(dev, width, height): + """Open a USB webcam. + + We want to set width and height here, otherwise we could just do: + return cv2.VideoCapture(dev) + """ + gst_str = ('v4l2src device=/dev/video{} ! ' + 'video/x-raw, width=(int){}, height=(int){} ! ' + 'videoconvert ! 
appsink').format(dev, width, height) + return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER) + + +def open_cam_onboard(width, height): + """Open the Jetson onboard camera.""" + gst_elements = str(subprocess.check_output('gst-inspect-1.0')) + if 'nvcamerasrc' in gst_elements: + # On versions of L4T prior to 28.1, you might need to add + # 'flip-method=2' into gst_str below. + gst_str = ('nvcamerasrc ! ' + 'video/x-raw(memory:NVMM), ' + 'width=(int)2592, height=(int)1458, ' + 'format=(string)I420, framerate=(fraction)30/1 ! ' + 'nvvidconv ! ' + 'video/x-raw, width=(int){}, height=(int){}, ' + 'format=(string)BGRx ! ' + 'videoconvert ! appsink').format(width, height) + elif 'nvarguscamerasrc' in gst_elements: + gst_str = ('nvarguscamerasrc ! ' + 'video/x-raw(memory:NVMM), ' + 'width=(int)1920, height=(int)1080, ' + 'format=(string)NV12, framerate=(fraction)30/1 ! ' + 'nvvidconv flip-method=2 ! ' + 'video/x-raw, width=(int){}, height=(int){}, ' + 'format=(string)BGRx ! ' + 'videoconvert ! appsink').format(width, height) + else: + raise RuntimeError('onboard camera source not found!') + return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER) + + +def grab_img(cam): + """This 'grab_img' function is designed to be run in the sub-thread. + Once started, this thread continues to grab a new image and put it + into the global 'img_handle', until 'thread_running' is set to False. + """ + while cam.thread_running: + _, cam.img_handle = cam.cap.read() + if cam.img_handle is None: + logging.warning('grab_img(): cap.read() returns None...') + break + cam.thread_running = False + + +class Camera(): + """Camera class which supports reading images from theses video sources: + + 1. Video file + 2. Image (jpg, png, etc.) file, repeating indefinitely + 3. RTSP (IP CAM) + 4. USB webcam + 5. Jetson onboard camera + """ + + def __init__(self, args): + self.args = args + self.is_opened = False + self.use_thread = False + self.thread_running = False + self.img_handle = None + self.img_width = 0 + self.img_height = 0 + self.cap = None + self.thread = None + + def open(self): + """Open camera based on command line arguments.""" + assert self.cap is None, 'Camera is already opened!' 
+ args = self.args + if args.use_file: + self.cap = cv2.VideoCapture(args.filename) + # ignore image width/height settings here + self.use_thread = False + elif args.use_image: + self.cap = 'OK' + self.img_handle = cv2.imread(args.filename) + # ignore image width/height settings here + if self.img_handle is not None: + self.is_opened = True + self.img_height, self.img_width, _ = self.img_handle.shape + self.use_thread = False + elif args.use_rtsp: + self.cap = open_cam_rtsp( + args.rtsp_uri, + args.image_width, + args.image_height, + args.rtsp_latency + ) + self.use_thread = True + elif args.use_usb: + self.cap = open_cam_usb( + args.video_dev, + args.image_width, + args.image_height + ) + self.use_thread = True + else: # by default, use the jetson onboard camera + self.cap = open_cam_onboard( + args.image_width, + args.image_height + ) + self.use_thread = True + if self.cap != 'OK': + if self.cap.isOpened(): + # Try to grab the 1st image and determine width and height + _, img = self.cap.read() + if img is not None: + self.img_height, self.img_width, _ = img.shape + self.is_opened = True + + def start(self): + assert not self.thread_running + if self.use_thread: + self.thread_running = True + self.thread = threading.Thread(target=grab_img, args=(self,)) + self.thread.start() + + def stop(self): + self.thread_running = False + if self.use_thread: + self.thread.join() + + def read(self): + if self.args.use_file: + _, img = self.cap.read() + if img is None: + #logging.warning('grab_img(): cap.read() returns None...') + # looping around + self.cap.release() + self.cap = cv2.VideoCapture(self.args.filename) + _, img = self.cap.read() + return img + elif self.args.use_image: + return np.copy(self.img_handle) + else: + return self.img_handle + + def release(self): + assert not self.thread_running + if self.cap != 'OK': + self.cap.release() diff --git a/utils/egohands_models.py b/utils/egohands_models.py new file mode 100644 index 0000000..ee361b3 --- /dev/null +++ b/utils/egohands_models.py @@ -0,0 +1,44 @@ +"""egohands_models.py +""" + + +SUPPORTED_MODELS = { + 'ssd_mobilenet_v1_egohands': { + 'config_path': 'data/ssd_mobilenet_v1_egohands.config', + 'checkpoint_path': 'data/ssd_mobilenet_v1_egohands/model.ckpt-20000', + }, + 'ssd_mobilenet_v2_egohands': { + 'config_path': 'data/ssd_mobilenet_v2_egohands.config', + 'checkpoint_path': 'data/ssd_mobilenet_v2_egohands/model.ckpt-20000', + }, + 'ssdlite_mobilenet_v2_egohands': { + 'config_path': 'data/ssdlite_mobilenet_v2_egohands.config', + 'checkpoint_path': 'data/ssdlite_mobilenet_v2_egohands/model.ckpt-20000', + }, + 'ssd_inception_v2_egohands': { + 'config_path': 'data/ssd_inception_v2_egohands.config', + 'checkpoint_path': 'data/ssd_inception_v2_egohands/model.ckpt-20000', + }, + 'rfcn_resnet101_egohands': { + 'config_path': 'data/rfcn_resnet101_egohands.config', + 'checkpoint_path': 'data/rfcn_resnet101_egohands/model.ckpt-50000', + }, + 'faster_rcnn_resnet50_egohands': { + 'config_path': 'data/faster_rcnn_resnet50_egohands.config', + 'checkpoint_path': 'data/faster_rcnn_resnet50_egohands/model.ckpt-50000', + }, + 'faster_rcnn_resnet101_egohands': { + 'config_path': 'data/faster_rcnn_resnet101_egohands.config', + 'checkpoint_path': 'data/faster_rcnn_resnet101_egohands/model.ckpt-50000', + }, + 'faster_rcnn_inception_v2_egohands': { + 'config_path': 'data/faster_rcnn_inception_v2_egohands.config', + 'checkpoint_path': 'data/faster_rcnn_inception_v2_egohands/model.ckpt-50000', + }, +} + + +def get_egohands_model(model_name): + assert 
model_name in SUPPORTED_MODELS + return (SUPPORTED_MODELS[model_name]['config_path'], + SUPPORTED_MODELS[model_name]['checkpoint_path']) diff --git a/utils/od_utils.py b/utils/od_utils.py new file mode 100644 index 0000000..2755bb5 --- /dev/null +++ b/utils/od_utils.py @@ -0,0 +1,151 @@ +'''od_utils.py + +Object detection utility functions. +''' + + +import time + +import numpy as np +import cv2 +import tensorflow as tf +import tensorflow.contrib.tensorrt as trt + + +MEASURE_MODEL_TIME = False +avg_time = 0.0 + + +def read_label_map(path_to_labels): + """Read from the label map file and return a class dictionary which + maps class id (int) to the corresponding display name (string). + + Reference: + https://github.com/tensorflow/models/blob/master/research/object_detection/object_detection_tutorial.ipynb + """ + from object_detection.utils import label_map_util + + category_index = label_map_util.create_category_index_from_labelmap( + path_to_labels) + cls_dict = {int(x['id']): x['name'] for x in category_index.values()} + num_classes = max(c for c in cls_dict.keys()) + 1 + # add missing classes as, say,'CLS12' if any + return {i: cls_dict.get(i, 'CLS{}'.format(i)) for i in range(num_classes)} + + +def build_trt_pb(model_name, pb_path, download_dir='data'): + """Build TRT model from the original TF model, and save the graph + into a pb file for faster access in the future. + + The code was mostly taken from the following example by NVIDIA. + https://github.com/NVIDIA-Jetson/tf_trt_models/blob/master/examples/detection/detection.ipynb + """ + from tf_trt_models.detection import download_detection_model + from tf_trt_models.detection import build_detection_graph + from utils.egohands_models import get_egohands_model + + if 'coco' in model_name: + config_path, checkpoint_path = \ + download_detection_model(model_name, download_dir) + else: + config_path, checkpoint_path = \ + get_egohands_model(model_name) + frozen_graph_def, input_names, output_names = build_detection_graph( + config=config_path, + checkpoint=checkpoint_path + ) + trt_graph_def = trt.create_inference_graph( + input_graph_def=frozen_graph_def, + outputs=output_names, + max_batch_size=1, + max_workspace_size_bytes=1 << 26, + precision_mode='FP16', + minimum_segment_size=50 + ) + with open(pb_path, 'wb') as pf: + pf.write(trt_graph_def.SerializeToString()) + + +def load_trt_pb(pb_path): + """Load the TRT graph from the pre-build pb file.""" + trt_graph_def = tf.GraphDef() + with tf.gfile.GFile(pb_path, 'rb') as pf: + trt_graph_def.ParseFromString(pf.read()) + # force CPU device placement for NMS ops + for node in trt_graph_def.node: + if 'rfcn_' in pb_path and 'SecondStage' in node.name: + node.device = '/device:GPU:0' + if 'faster_rcnn_' in pb_path and 'SecondStage' in node.name: + node.device = '/device:GPU:0' + if 'NonMaxSuppression' in node.name: + node.device = '/device:CPU:0' + with tf.Graph().as_default() as trt_graph: + tf.import_graph_def(trt_graph_def, name='') + return trt_graph + + +def write_graph_tensorboard(sess, log_path): + """Write graph summary to log_path, so TensorBoard could display it.""" + writer = tf.summary.FileWriter(log_path) + writer.add_graph(sess.graph) + writer.flush() + writer.close() + + +def _preprocess(src, shape=None, to_rgb=True): + """Preprocess input image for the TF-TRT object detection model.""" + img = src.astype(np.uint8) + if shape: + img = cv2.resize(img, shape) + if to_rgb: + # BGR to RGB + img = img[..., ::-1] + return img + + +def _postprocess(img, boxes, scores, classes, 
conf_th):
+    """Postprocess output of the TF-TRT object detector."""
+    h, w, _ = img.shape
+    out_box = boxes[0] * np.array([h, w, h, w])
+    out_box = out_box.astype(np.int32)
+    out_conf = scores[0]
+    out_cls = classes[0].astype(np.int32)
+
+    # only return bboxes with confidence score above threshold
+    mask = np.where(out_conf >= conf_th)
+    return (out_box[mask], out_conf[mask], out_cls[mask])
+
+
+def detect(origimg, tf_sess, conf_th, od_type='ssd'):
+    """Do object detection over 1 image."""
+    global avg_time
+
+    tf_input = tf_sess.graph.get_tensor_by_name('image_tensor:0')
+    tf_scores = tf_sess.graph.get_tensor_by_name('detection_scores:0')
+    tf_boxes = tf_sess.graph.get_tensor_by_name('detection_boxes:0')
+    tf_classes = tf_sess.graph.get_tensor_by_name('detection_classes:0')
+    #tf_num = tf_sess.graph.get_tensor_by_name('num_detections:0')
+
+    if od_type == 'faster_rcnn':
+        img = _preprocess(origimg, (1024, 576))
+    elif od_type == 'ssd':
+        img = _preprocess(origimg, (300, 300))
+    else:
+        raise ValueError('bad object detector type: %s' % od_type)
+
+    if MEASURE_MODEL_TIME:
+        tic = time.time()
+
+    boxes_out, scores_out, classes_out = tf_sess.run(
+        [tf_boxes, tf_scores, tf_classes],
+        feed_dict={tf_input: img[None, ...]})
+
+    if MEASURE_MODEL_TIME:
+        td = (time.time() - tic) * 1000  # in ms
+        avg_time = avg_time * 0.9 + td * 0.1
+        print('tf_sess.run() took {:.1f} ms on average'.format(avg_time))
+
+    box, conf, cls = _postprocess(
+        origimg, boxes_out, scores_out, classes_out, conf_th)
+
+    return (box, conf, cls)
diff --git a/utils/test_visualization.py b/utils/test_visualization.py
new file mode 100644
index 0000000..d0ef13d
--- /dev/null
+++ b/utils/test_visualization.py
@@ -0,0 +1,33 @@
+"""test_visualization.py
+"""
+
+
+import numpy as np
+import cv2
+from visualization import BBoxVisualization
+
+
+def main():
+    """main
+    """
+    cls_dict = {0: 'cat', 1: 'dog', 2: 'person'}
+    num_classes = 3
+    boxes = np.array(
+        [[100, 100, 300, 200],
+         [100, 400, 300, 500],
+         [200, 250, 400, 300]],
+        dtype=np.int32
+    )
+    confs = np.array([0.5, 0.7, 0.9], dtype=np.float32)
+    clss = np.array([0, 1, 2], dtype=np.int32)
+
+    img = cv2.imread('../examples/detection/data/huskies.jpg')
+    assert img is not None
+    vis = BBoxVisualization(cls_dict)
+    img = vis.draw_bboxes(img, boxes, confs, clss)
+    cv2.imshow('Test BBoxVisualization', img)
+    cv2.waitKey(0)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/visualization.py b/utils/visualization.py
new file mode 100644
index 0000000..6497716
--- /dev/null
+++ b/utils/visualization.py
@@ -0,0 +1,102 @@
+"""visualization.py
+
+The BBoxVisualization class implements drawing of nice looking
+bounding boxes based on object detection results.
+"""
+
+
+import numpy as np
+import cv2
+
+
+# Constants
+ALPHA = 0.5
+FONT = cv2.FONT_HERSHEY_PLAIN
+TEXT_SCALE = 1.0
+TEXT_THICKNESS = 1
+BLACK = (0, 0, 0)
+WHITE = (255, 255, 255)
+
+
+def gen_colors(num_colors):
+    """Generate different colors.
+
+    # Arguments
+      num_colors: total number of colors/classes.
+
+    # Output
+      bgrs: a list of (B, G, R) tuples which correspond to each of
+        the colors/classes.
+    """
+    import random
+    import colorsys
+
+    hsvs = [[float(x) / num_colors, 1., 0.7] for x in range(num_colors)]
+    random.seed(1234)
+    random.shuffle(hsvs)
+    rgbs = list(map(lambda x: list(colorsys.hsv_to_rgb(*x)), hsvs))
+    bgrs = [(int(rgb[2] * 255), int(rgb[1] * 255), int(rgb[0] * 255))
+            for rgb in rgbs]
+    return bgrs
+
+
+def draw_boxed_text(img, text, topleft, color):
+    """Draw a translucent boxed text in white, overlaid on top of a
+    colored patch surrounded by a black border. FONT, TEXT_SCALE,
+    TEXT_THICKNESS and ALPHA values are constants (fixed) as defined
+    on top.
+
+    # Arguments
+      img: the input image as a numpy array.
+      text: the text to be drawn.
+      topleft: XY coordinate of the topleft corner of the boxed text.
+      color: color of the patch, i.e. background of the text.
+
+    # Output
+      img: note the original image is modified inplace.
+    """
+    assert img.dtype == np.uint8
+    img_h, img_w, _ = img.shape
+    if topleft[0] >= img_w or topleft[1] >= img_h:
+        return img
+    margin = 3
+    size = cv2.getTextSize(text, FONT, TEXT_SCALE, TEXT_THICKNESS)
+    w = size[0][0] + margin * 2
+    h = size[0][1] + margin * 2
+    # the patch is used to draw boxed text
+    patch = np.zeros((h, w, 3), dtype=np.uint8)
+    patch[...] = color
+    cv2.putText(patch, text, (margin+1, h-margin-2), FONT, TEXT_SCALE,
+                WHITE, thickness=TEXT_THICKNESS, lineType=cv2.LINE_8)
+    cv2.rectangle(patch, (0, 0), (w-1, h-1), BLACK, thickness=1)
+    w = min(w, img_w - topleft[0])  # clip overlay at image boundary
+    h = min(h, img_h - topleft[1])
+    # Overlay the boxed text onto region of interest (roi) in img
+    roi = img[topleft[1]:topleft[1]+h, topleft[0]:topleft[0]+w, :]
+    cv2.addWeighted(patch[0:h, 0:w, :], ALPHA, roi, 1 - ALPHA, 0, roi)
+    return img
+
+
+class BBoxVisualization():
+    """BBoxVisualization class implements nice drawing of bounding boxes.
+
+    # Arguments
+      cls_dict: a dictionary used to translate class id to its name.
+    """
+
+    def __init__(self, cls_dict):
+        self.cls_dict = cls_dict
+        self.colors = gen_colors(len(cls_dict))
+
+    def draw_bboxes(self, img, box, conf, cls):
+        """Draw detected bounding boxes on the original image."""
+        for bb, cf, cl in zip(box, conf, cls):
+            cl = int(cl)
+            y_min, x_min, y_max, x_max = bb[0], bb[1], bb[2], bb[3]
+            color = self.colors[cl]
+            cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, 2)
+            txt_loc = (max(x_min+2, 0), max(y_min+2, 0))
+            cls_name = self.cls_dict.get(cl, 'CLS{}'.format(cl))
+            txt = '{} {:.2f}'.format(cls_name, cf)
+            img = draw_boxed_text(img, txt, txt_loc, color)
+        return img
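
`utils/camera.py` above exposes a single `read()` interface over video files, still images, RTSP streams, USB webcams and the Jetson onboard camera. The sketch below shows the intended call sequence; the script name and the command-line flags you would pass to it (e.g. `--usb --vid 0`) are illustrative assumptions, not something shipped in this diff:

```
# camera_demo.py -- minimal sketch of the Camera API above (illustrative);
# run with e.g. --usb --vid 0, or --rtsp --uri rtsp://..., or --file --filename test.mp4
import argparse

import cv2

from utils.camera import add_camera_args, Camera


def main():
    parser = argparse.ArgumentParser()
    parser = add_camera_args(parser)
    args = parser.parse_args()

    cam = Camera(args)
    cam.open()    # determines image width/height and sets cam.is_opened
    assert cam.is_opened, 'Failed to open the camera source!'
    cam.start()   # starts the grab_img() thread for live sources

    try:
        while True:
            img = cam.read()   # latest frame (or next frame for --file input)
            if img is not None:
                cv2.imshow('Camera demo', img)
            if cv2.waitKey(1) == 27:   # press ESC to quit
                break
    finally:
        cam.stop()
        cam.release()
        cv2.destroyAllWindows()


if __name__ == '__main__':
    main()
```
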
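
Putting the pieces together, the following sketch shows how `od_utils.py`, the egohands model table (via `build_trt_pb`) and `visualization.py` could be combined to run a TF-TRT optimized detector on one image. The cached `.pb` path, the test image name and the `conf_th` value are illustrative assumptions, not fixed by the code above:

```
# detect_image.py -- illustrative end-to-end sketch (not a script in this diff)
# combining build_trt_pb/load_trt_pb, detect() and BBoxVisualization.
import os

import cv2
import tensorflow as tf

from utils.od_utils import build_trt_pb, load_trt_pb, detect, read_label_map
from utils.visualization import BBoxVisualization

MODEL = 'ssd_mobilenet_v1_egohands'          # any key of SUPPORTED_MODELS or MODELS
PB_PATH = 'data/{}_trt.pb'.format(MODEL)     # assumed cache location for the TRT graph
LABELMAP = 'data/egohands_label_map.pbtxt'


def main():
    if not os.path.isfile(PB_PATH):
        build_trt_pb(MODEL, PB_PATH)         # optimize once, then reuse the cached pb
    trt_graph = load_trt_pb(PB_PATH)

    cls_dict = read_label_map(LABELMAP)
    vis = BBoxVisualization(cls_dict)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(graph=trt_graph, config=tf_config) as sess:
        img = cv2.imread('hand.jpg')         # illustrative test image
        box, conf, cls = detect(img, sess, conf_th=0.3, od_type='ssd')
        img = vis.draw_bboxes(img, box, conf, cls)
        cv2.imwrite('hand_result.jpg', img)


if __name__ == '__main__':
    main()
```

Since the TRT-optimized graph is serialized to `PB_PATH` the first time, subsequent runs skip the slow `build_detection_graph`/`create_inference_graph` step and only call `load_trt_pb`.
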