在本教程中,您將學習如何使用經典的 one-stage 目標檢測網絡 Yolo v3 來實現口罩檢測。關於 Yolo v3 的詳細資料,可以閱讀其原始論文(paper)。
進入控制臺 Notebook 頁面,單擊「創建Notebook實例」按鈕。
進入「我的Notebook」頁面,在「操作」列單擊「打開Notebook」。
進入 Notebook 詳情頁面,單擊「打開Notebook」。
import cv2import mathimport matplotlib.pyplot as pltimport numpy as npimport osimport randomimport timeimport torchimport torchvisionimport torch.nn as nnimport torch.nn.init as initimport torch.optim as optimimport xml.etree.ElementTree as ET from torch.utils.data import Dataset, DataLoader
下載口罩檢測數據集並上傳到 Notebook 服務器。這裡我們以 AIZOO 開源數據集為例,下載地址: https://pan.baidu.com/s/1nsQf_Py5YyKm87-8HiyJeQ ,提取碼:eyfz
VOC_CLASSES = ('face', 'face_mask')


class AnnotationTransform(object):
    """Turn a VOC-style XML annotation tree into an array of labelled boxes.

    Args:
        class_to_ind: optional mapping from class name to integer index.
            Defaults to the position of each name in ``VOC_CLASSES``.
        keep_difficult: when False, objects whose ``<difficult>`` flag is 1
            are skipped.
    """

    def __init__(self, class_to_ind=None, keep_difficult=True):
        self.class_to_ind = class_to_ind or dict(
            zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target):
        """Parse every ``<object>`` under ``target``.

        Args:
            target: root element of the parsed annotation XML.

        Returns:
            Float array of shape (N, 5); each row is
            ``[xmin, ymin, xmax, ymax, label_ind]``. N is 0 when the
            annotation contains no (kept) objects.

        Raises:
            KeyError: if an object name is not in ``class_to_ind``.
        """
        rows = []
        for obj in target.iter('object'):
            # A missing or empty <difficult> tag is treated as "not difficult"
            # instead of crashing on ``None.text``.
            flag = obj.findtext('difficult', default='0').strip()
            difficult = bool(flag) and int(flag) == 1
            if difficult and not self.keep_difficult:
                continue

            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')
            # Coordinates are shifted by one, as in the original code
            # (presumably converting 1-based VOC pixels to 0-based).
            row = [int(bbox.find(tag).text) - 1
                   for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
            row.append(self.class_to_ind[name])
            rows.append(row)

        # reshape keeps the (0, 5) shape when there are no objects.
        return np.array(rows, dtype=np.float64).reshape(-1, 5)


def preproc_for_test(image, input_size, mean, std):
    """Resize, convert and normalise one image for the network.

    The image is resized to ``input_size`` with a randomly chosen OpenCV
    interpolation method, cast to float32, its channel axis is reversed,
    values are scaled by 1/255, and ``mean`` / ``std`` are applied when given.

    Args:
        image: H x W x C image array (as returned by ``cv2.imread``).
        input_size: target size passed straight to ``cv2.resize``.
        mean: per-channel mean to subtract, or None.
        std: per-channel std to divide by, or None.

    Returns:
        Array with the channel axis moved first (C x H x W).
    """
    interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA,
                      cv2.INTER_NEAREST, cv2.INTER_LANCZOS4]
    interp_method = random.choice(interp_methods)
    image = cv2.resize(image, input_size, interpolation=interp_method)
    image = image.astype(np.float32)
    image = image[:, :, ::-1]  # reverse channel order
    image /= 255.
    if mean is not None:
        image -= mean
    if std is not None:
        image /= std
    return image.transpose(2, 0, 1)


class TrainTransform(object):
    """Prepare an (image, boxes) pair for training.

    Args:
        rgb_means: per-channel mean forwarded to ``preproc_for_test``.
        std: per-channel std forwarded to ``preproc_for_test``.
        max_labels: number of rows in the fixed-size label tensor; extra
            boxes are dropped and missing ones are zero-padded.
    """

    def __init__(self, rgb_means=None, std=None, max_labels=50):
        self.means = rgb_means
        self.std = std
        self.max_labels = max_labels

    def __call__(self, image, targets, img_size):
        """Transform one sample.

        Args:
            image: H x W x C image array.
            targets: (N, 5) array of ``[xmin, ymin, xmax, ymax, label]``.
                It is NOT modified (the original code overwrote it in place).
            img_size: ``(width, height)`` of the network input.

        Returns:
            ``(image_tensor, label_tensor)`` where ``label_tensor`` has shape
            ``(max_labels, 5)`` and rows ``[label, cx, cy, w, h]`` expressed
            in the resized image's pixel coordinates.
        """
        # Work on copies so the caller's annotation array stays untouched.
        corners = targets[:, :4].astype(np.float64)
        labels = targets[:, 4].copy()

        # preproc_for_test already resizes to img_size, so the image is
        # resized exactly once (the original resized it twice).
        image_t = preproc_for_test(image, img_size, self.means, self.std)
        image_t = np.ascontiguousarray(image_t, dtype=np.float32)

        if len(corners) == 0:
            empty = np.zeros((self.max_labels, 5), dtype=np.float32)
            return torch.from_numpy(image_t), torch.from_numpy(empty)

        height, width = image.shape[:2]

        # Corner format -> centre / size format.
        boxes = np.stack((
            (corners[:, 0] + corners[:, 2]) * .5,
            (corners[:, 1] + corners[:, 3]) * .5,
            corners[:, 2] - corners[:, 0],
            corners[:, 3] - corners[:, 1],
        ), axis=1)

        # Rescale from original pixels to network-input pixels:
        # columns 0 and 2 are horizontal, columns 1 and 3 are vertical.
        boxes[:, 0::2] /= width
        boxes[:, 1::2] /= height
        boxes[:, 0::2] *= img_size[0]
        boxes[:, 1::2] *= img_size[1]

        targets_t = np.hstack((np.expand_dims(labels, 1), boxes))
        padded_labels = np.zeros((self.max_labels, 5), dtype=np.float32)
        count = min(len(targets_t), self.max_labels)
        padded_labels[:count] = targets_t[:count]

        return torch.from_numpy(image_t), torch.from_numpy(padded_labels)


# Dataset definition
class VOCDetection(Dataset):
    """Detection dataset read from folders of paired ``.jpg`` / ``.xml`` files.

    Args:
        root: directory that contains the split folders.
        preproc: callable ``(img, target, img_size) -> (img, target)`` or None.
        target_transform: callable applied to the parsed XML root, or None.
        img_size: network input size handed to ``preproc``.
        split: ``'train'`` reads folders ``part1`` and ``part2``; any other
            value reads folder ``val``.
    """

    def __init__(self, root, preproc=None, target_transform=AnnotationTransform(),
                 img_size=(416, 416), split='train'):
        super().__init__()
        self.root = root
        self.preproc = preproc
        self.target_transform = target_transform
        self.img_size = img_size
        self._annopath = os.path.join('%s', 'Annotations', '%s.xml')
        self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg')
        self._classes = VOC_CLASSES
        self._year = '2012'  # options: '2007', which is related to eval protocol

        folders = ['part1', 'part2'] if split == 'train' else ['val']
        stems = set()
        for folder in folders:
            folder_path = os.path.join(self.root, folder)
            for entry in os.listdir(folder_path):
                stem, ext = os.path.splitext(entry)
                # Only real samples count; blindly stripping four characters
                # turned things like ``.ipynb_checkpoints`` into fake items.
                if ext in ('.jpg', '.xml'):
                    stems.add(os.path.join(folder_path, stem))
        # Sorted so that index -> sample is reproducible between runs.
        self.item_container = sorted(stems)

    def __getitem__(self, index):
        """Load sample ``index``.

        Returns:
            ``(img, target, (width, height), item)`` where ``item`` is the
            sample path without extension and ``(width, height)`` is the
            size of the image as read from disk.

        Raises:
            FileNotFoundError: if the ``.jpg`` cannot be read.
        """
        item = self.item_container[index]
        target = ET.parse(item + '.xml').getroot()
        img = cv2.imread(item + '.jpg')
        if img is None:
            # cv2.imread signals failure by returning None.
            raise FileNotFoundError('cannot read image: ' + item + '.jpg')
        height, width, _ = img.shape

        if self.target_transform is not None:
            target = self.target_transform(target)

        if self.preproc is not None:
            img, target = self.preproc(img, target, self.img_size)

        img_info = (width, height)
        return img, target, img_info, item

    def __len__(self):
        return len(self.item_container)


dataset = VOCDetection(root='./', preproc=TrainTransform(), split='train')
from yolo import YOLOv3

model = YOLOv3(num_classes=len(VOC_CLASSES))


def init_yolo(M):
    """Initialise the weights of every layer inside ``M``.

    Conv2d: Kaiming-normal weights (a=0.1, fan_in), zero bias when present.
    BatchNorm2d: weight 1, bias 0.
    Linear: normal(0, 0.01) weights, zero bias.

    Args:
        M: module whose sub-modules (including itself) are initialised.
    """
    for m in M.modules():
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight, a=0.1, mode='fan_in')
            if m.bias is not None:
                init.zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2d):
            init.ones_(m.weight)
            init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            init.normal_(m.weight, 0, 0.01)
            init.zeros_(m.bias)
            # The original followed this with ``m.state_dict()[key][...] = 0``,
            # which referenced an undefined ``key`` (NameError). The bias is
            # already zeroed above, so the statement is dropped.


# init_yolo walks M.modules() itself, so one call covers the whole network;
# model.apply(init_yolo) would repeat that full walk once per sub-module.
init_yolo(model)
model.train()

torch.backends.cudnn.benchmark = True
# Fall back to CPU so the cell still runs on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Hyper-parameters, defined before training starts.
batch_size = 8       # samples per batch; should not be too small, bounded by GPU memory
base_lr = 0.0001     # base learning rate
warmup_epochs = 10   # epochs over which the learning rate ramps up to base_lr
epochs = 70          # total number of training epochs
save_interval = 10   # save a checkpoint every this many epochs
steps = [50, 60]     # epochs at which the learning rate is reduced
log_interval = 1     # print progress every this many iterations

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                        num_workers=2, pin_memory=True)
optimizer = optim.Adam(model.parameters(), lr=base_lr, weight_decay=0.0005)

# Iterations actually executed per epoch. The loader keeps the last partial
# batch, so len(dataset) // batch_size under-counted (and could be 0, which
# made the warm-up formula divide by zero).
epoch_size = len(dataloader)


def set_lr(tmp_lr):
    """Set the learning rate of every parameter group of ``optimizer``."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = tmp_lr


# Defined up front so the progress print never sees an unbound name
# (e.g. when warmup_epochs <= 0).
tmp_lr = base_lr

for epoch in range(1, epochs + 1):
    print('\n[Epoch {} started]'.format(epoch))

    for iter_i, (imgs, targets, _, _) in enumerate(dataloader):
        start = time.time()

        # Update the learning rate.
        if epoch < warmup_epochs:
            # Linear warm-up based on the global iteration count.
            tmp_lr = base_lr * ((iter_i + epoch * epoch_size) * 1.
                                / (warmup_epochs * epoch_size))
            set_lr(tmp_lr)
        elif epoch == warmup_epochs:
            tmp_lr = base_lr
            set_lr(tmp_lr)
        elif epoch in steps and iter_i == 0:
            # Decay once, at the first iteration of a step epoch.
            tmp_lr = tmp_lr * 0.1
            set_lr(tmp_lr)

        optimizer.zero_grad()

        imgs = imgs.to(device).to(torch.float32)
        targets = targets.to(device).to(torch.float32)

        # In training mode the model returns a dict of per-head loss lists.
        loss_dict = model(imgs, targets, epoch)
        loss = sum(loss_dict['losses'])
        loss.backward()
        optimizer.step()

        end = time.time()
        if iter_i % log_interval == 0:
            # Print training progress.
            print('\r[Epoch %d/%d][Iter %d/%d][LR %.6f]'
                  '[Loss: l1 %.2f, conf %.6f, cls %.6f][Time: %.2f s]......'
                  % (epoch, epochs, iter_i + 1, epoch_size, tmp_lr,
                     sum(loss_dict['l1_losses']).item(),
                     sum(loss_dict['conf_losses']).item(),
                     sum(loss_dict['cls_losses']).item(),
                     end - start), end='')

    # Checkpoint once per save epoch, after the epoch finished. The original
    # rewrote the same file on every iteration of such an epoch.
    if epoch % save_interval == 0:
        torch.save(model.state_dict(),
                   'yolov3_mask_detection_{}.pth'.format(epoch))

torch.save(model.state_dict(), 'yolov3_mask_detection_final.pth')
class ValTransform(object):
    """Deterministic preprocessing for validation / inference images.

    Args:
        rgb_means: per-channel mean to subtract, or None.
        std: per-channel std to divide by, or None.
        swap: axis order applied at the end (default moves channels first).
    """

    def __init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
        self.means = rgb_means
        self.swap = swap
        self.std = std

    def __call__(self, img, res, input_size):
        """Preprocess one image.

        Args:
            img: H x W x C image (anything ``np.array`` accepts).
            res: ignored; present so the call matches the training transform.
            input_size: target size passed to ``cv2.resize``.

        Returns:
            ``(image_tensor, placeholder)`` where ``placeholder`` is a
            ``torch.zeros(1, 5)`` dummy label tensor.
        """
        # Always bilinear here, unlike the random choice used for training.
        img = cv2.resize(np.array(img), input_size,
                         interpolation=cv2.INTER_LINEAR).astype(np.float32)
        img = img[:, :, ::-1]  # reverse channel order
        img /= 255.
        if self.means is not None:
            img -= self.means
        if self.std is not None:
            img /= self.std
        img = img.transpose(self.swap)
        img = np.ascontiguousarray(img, dtype=np.float32)
        return torch.from_numpy(img), torch.zeros(1, 5)


transform = ValTransform()

image_path = "val/test_00000760.jpg"  # input image
im = cv2.imread(image_path)
if im is None:
    # cv2.imread returns None instead of raising when the file is unreadable.
    raise FileNotFoundError('cannot read image: ' + image_path)
ori_im = im.copy()
height, width, _ = im.shape

test_size = (416, 416)
im_input, _ = transform(im, None, test_size)

# Define the device before it is used, so this cell does not depend on a
# ``device`` left over from an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained weights; map_location lets a GPU-saved file load on CPU.
model.load_state_dict(
    torch.load('yolov3_mask_detection_final.pth', map_location=device))
model = model.to(device)
model.eval()

im_input = im_input.to(device).type(torch.float32).unsqueeze(0)
# No gradients are needed for inference.
with torch.no_grad():
    outputs = model(im_input)
def postprocess(prediction, num_classes=2, conf_thre=0.3, nms_thre=0.45):
    """Filter raw predictions by confidence and apply per-class NMS.

    Args:
        prediction: tensor indexed as ``[image, box, field]`` whose fields
            are ``(cx, cy, w, h, obj_conf, class scores...)``. It is left
            unmodified (the original overwrote its first four fields).
        num_classes: number of class-score fields following ``obj_conf``.
        conf_thre: minimum ``obj_conf * class_conf`` for a box to be kept.
        nms_thre: IoU threshold handed to ``torchvision.ops.nms``.

    Returns:
        List with one entry per image: ``None`` when nothing survives,
        otherwise a tensor whose rows are
        ``(x1, y1, x2, y2, obj_conf, class_conf, class_pred)``.
    """
    # Centre/size -> corner format, on a copy of the input.
    prediction = prediction.clone()
    centers = prediction[:, :, :2].clone()
    half_sizes = prediction[:, :, 2:4] / 2
    prediction[:, :, :2] = centers - half_sizes
    prediction[:, :, 2:4] = centers + half_sizes

    output = [None for _ in range(len(prediction))]
    for i, image_pred in enumerate(prediction):
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue

        # Score and index of the most confident class for every box.
        class_conf, class_pred = torch.max(
            image_pred[:, 5:5 + num_classes], 1, keepdim=True)

        # Index column 0 rather than calling .squeeze(): with exactly one
        # box, squeeze() yields a 0-dim mask and the boolean indexing below
        # would then add a dimension instead of filtering rows.
        conf_mask = image_pred[:, 4] * class_conf[:, 0] >= conf_thre

        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        detections = torch.cat(
            (image_pred[:, :5], class_conf, class_pred.float()), 1)
        detections = detections[conf_mask]
        if not detections.size(0):
            continue

        # Run NMS separately for every class that is present.
        kept = []
        for c in detections[:, -1].unique():
            detections_class = detections[detections[:, -1] == c]
            nms_out_index = torchvision.ops.nms(
                detections_class[:, :4],
                detections_class[:, 4] * detections_class[:, 5],
                nms_thre)
            kept.append(detections_class[nms_out_index])
        output[i] = torch.cat(kept)

    return output


outputs = postprocess(outputs, 2, 0.01, 0.35)
if outputs[0] is None:
    # Nothing detected: keep an empty (0, 7) tensor so the code below and
    # the drawing step still work instead of failing on ``None.cpu()``.
    outputs = torch.zeros((0, 7))
else:
    outputs = outputs[0].cpu().data

# Map boxes from network-input pixels back to the original image, then
# convert (x1, y1, x2, y2) to (x, y, w, h).
bboxes = outputs[:, 0:4]
bboxes[:, 0::2] *= width / test_size[0]
bboxes[:, 1::2] *= height / test_size[1]
bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
cls = outputs[:, 6]
scores = outputs[:, 4] * outputs[:, 5]
def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None, color=None):
    """Draw detection boxes (and labels) onto ``img``.

    Args:
        img: image array to draw on; the cv2 drawing calls write into it.
        boxes: sequence of ``(x, y, w, h)`` boxes in pixel coordinates.
        scores: confidence per box; boxes below ``conf`` are skipped.
        cls_ids: class index per box; only read when ``class_names`` is given.
        conf: minimum score for a box to be drawn.
        class_names: optional sequence of class names; enables text labels
            and per-class colours.
        color: optional fixed colour for all boxes and labels.

    Returns:
        The image with boxes and labels drawn.
    """
    colors = torch.FloatTensor(
        [[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]])

    def get_color(c, x, max_val):
        """Interpolate channel ``c`` of the palette at position x / max_val."""
        ratio = float(x) / max_val * 5
        i = int(math.floor(ratio))
        j = int(math.ceil(ratio))
        ratio = ratio - i
        r = (1 - ratio) * colors[i][c] + ratio * colors[j][c]
        return int(r * 255)

    for i, box in enumerate(boxes):
        cls_conf = scores[i]
        if cls_conf < conf:
            continue

        x1 = int(box[0])
        y1 = int(box[1])
        x2 = int(box[0] + box[2])
        y2 = int(box[1] + box[3])

        rgb = color if color else (255, 0, 0)

        if class_names is not None:
            cls_id = int(cls_ids[i])
            class_name = class_names[cls_id]
            if color is None:
                classes = len(class_names)
                # The multiplier must be odd: with the previous even value
                # (123456) the offset was always 0 for an even number of
                # classes, so 'face' and 'face_mask' got the same colour.
                # NOTE(review): 123457 is the value I recall from darknet's
                # draw_detections - confirm against upstream.
                offset = cls_id * 123457 % classes
                red = get_color(2, offset, classes)
                green = get_color(1, offset, classes)
                blue = get_color(0, offset, classes)
                rgb = (red, green, blue)
            img = cv2.putText(img, '%s: %.2f' % (class_name, cls_conf),
                              (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX,
                              0.6, rgb, 2)

        img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1)

    return img


pred_im = vis(ori_im, bboxes.numpy(), scores.numpy(), cls.numpy(),
              conf=0.3, class_names=VOC_CLASSES)
plt.rcParams['figure.figsize'] = (20, 12)
# Reverse the channel axis for matplotlib (cv2.imread data is BGR).
plt.imshow(pred_im[:, :, ::-1])
