
Commit 2aa165a

wofmanafczczup authored and committed

crowd_human detection

1 parent 76b26f0 commit 2aa165a

File tree: 9 files changed, +1074 −1 lines changed
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@

```python
# dataset settings
dataset_type = 'CrowdHumanDataset'
data_root = 'data/CrowdHuman/'
classes = ('person', )
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        classes=classes,
        filter_empty_gt=True,
        ann_file=data_root + 'annotations/annotation_train.json',
        img_prefix=data_root + 'Images',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/annotation_val.json',
        img_prefix=data_root + 'Images',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/annotation_val.json',
        img_prefix=data_root + 'Images',
        pipeline=test_pipeline))
evaluation = dict(interval=100, metric='bbox')
```
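This is a standard mmdetection-style dataset config. For reference, a minimal sketch of how such a file is typically consumed, assuming an mmdetection-2.x environment; the config path below is a placeholder, not the actual path in this commit:

```python
# Minimal sketch (mmdetection 2.x assumed): load the dataset config above
# and build the training set from it. The file path is a placeholder.
from mmcv import Config
from mmdet.datasets import build_dataset

cfg = Config.fromfile('crowd_human_detection.py')  # placeholder path
train_set = build_dataset(cfg.data.train)
# filter_empty_gt=True drops images without ground-truth boxes at train time.
print(f'{len(train_set)} training images after empty-GT filtering')
```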
Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@

```python
# model settings
model = dict(
    type='CascadeRCNN',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ]),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                mask_size=28,
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100,
            mask_thr_binary=0.5)))
```
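Note that the bbox heads above keep the COCO default `num_classes=80`; a config inheriting from this file for CrowdHuman would presumably override it to match the single `person` class. As a quick sanity check, the model can be instantiated with mmdetection's builder; this is a sketch assuming an mmdetection-2.x environment, with a placeholder config path:

```python
# Sketch (mmdetection 2.x assumed): instantiate the Cascade R-CNN defined
# above. The config path is a placeholder.
from mmcv import Config
from mmdet.models import build_detector

cfg = Config.fromfile('cascade_rcnn_r50_fpn.py')  # placeholder path
detector = build_detector(cfg.model)
detector.init_weights()  # pulls torchvision://resnet50 for the backbone
n_params = sum(p.numel() for p in detector.parameters())
print(f'{n_params / 1e6:.1f}M parameters')
```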
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@

# CrowdHuman

## Introduction

Introduced by Shao et al. in [CrowdHuman: A Benchmark for Detecting Human in a Crowd](https://arxiv.org/pdf/1805.00123.pdf)

CrowdHuman is a benchmark dataset for better evaluating detectors in crowd scenarios. The dataset is large, richly annotated, and highly diverse: it contains 15000, 4370 and 5000 images for training, validation, and testing, respectively, with a total of 470K human instances in the train and validation subsets, an average of 23 persons per image, and various kinds of occlusion. Each human instance is annotated with a head bounding box, a human visible-region bounding box and a human full-body bounding box. We hope our dataset will serve as a solid baseline and help promote future research in human detection tasks.

## Prepare the data

Download the original dataset from [CrowdHuman](https://www.crowdhuman.org/download.html), then convert the annotations with `detection/tools/create_crowd_anno.py` (see the reading sketch after the directory tree below).
- The data tree of CrowdHuman should look like:

```bash
$ tree CrowdHuman
CrowdHuman
├── annotations
│   ├── annotation_train.json
│   ├── annotation_train.odgt
│   ├── annotation_val.json
│   ├── annotation_val.odgt
│   └── ...
└── Images
    ├── 1074488,79b360006b38332b.jpg
    ├── 1074488,79d54000c6f9d9e5.jpg
    └── ...
```
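The raw `.odgt` files are JSON-lines: each line is one image record with an `ID` and a `gtboxes` list carrying the head (`hbox`), visible-region (`vbox`) and full-body (`fbox`) boxes described above. A minimal reading sketch, with field names taken from the official CrowdHuman release (exactly which fields `create_crowd_anno.py` consumes is an assumption on our part):

```python
# Sketch: iterate over a CrowdHuman .odgt annotation file. Each line is a
# standalone JSON record following the official CrowdHuman field names.
import json

with open('annotations/annotation_train.odgt') as f:
    for line in f:
        record = json.loads(line)
        image_id = record['ID']          # matches '<ID>.jpg' in Images/
        for box in record['gtboxes']:
            tag = box['tag']             # 'person', or 'mask' for ignore regions
            x, y, w, h = box['fbox']     # full-body box in xywh pixel coords
```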
## Model Zoo

### Cascade Mask R-CNN + InternImage

| backbone       | schedule | box mAP | mask mAP | train speed | train time | #params | FLOPs | Config                                                   | Download |
| :------------: | :------: | :-----: | :------: | :---------: | :--------: | :-----: | :---: | :------------------------------------------------------: | :------: |
| InternImage-XL | 3x       | TBD     | TBD      | TBD         | TBD        | TBD     | TBD   | [config](./cascade_internimage_xl_fpn_3x_crowd_human.py) | TBD      |

- Training speed is measured on A100 GPUs with the current code and may be faster than the speed reported in the logs.
- Some logs come from recent retraining runs, so the results in the logs may differ slightly from those in our paper.
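The schedule and runtime settings are not part of this commit, but a typical mmdetection-2.x training entry point for the composed config might look like the sketch below; the config path and work directory are placeholders, and the composed config is assumed to also carry the usual optimizer, lr_config and runner settings:

```python
# Hypothetical sketch of a standard mmdetection-2.x training launch for the
# config referenced in the table above. Paths are placeholders.
from mmcv import Config
from mmdet.apis import train_detector
from mmdet.datasets import build_dataset
from mmdet.models import build_detector

cfg = Config.fromfile('cascade_internimage_xl_fpn_3x_crowd_human.py')  # placeholder path
cfg.work_dir = './work_dirs/crowd_human'  # placeholder output directory
cfg.gpu_ids = range(1)                    # single-GPU, non-distributed run
cfg.seed = None

model = build_detector(cfg.model)
model.init_weights()
datasets = [build_dataset(cfg.data.train)]
model.CLASSES = datasets[0].CLASSES       # ('person',) for CrowdHuman
train_detector(model, datasets, cfg, distributed=False, validate=True)
```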