FCN | Notion

<aside> 💡 파스칼 VOC 2012 챌린지 데이터세트를 활용해 FCN 모델을 미세 조정

</aside>

파스칼 VOC 데이터세트는 이미지 분류, 객체 검출 및 의미론적 분할을 위한 이미지와, 각 이미지에 포함된 객체의 레이블로 구성된다.

레이블은 20개의 객체 클래스와 배경 클래스로 구성

1,464장의 학습 이미지와 1,449장의 검증 이미지를 포함하며, 의미론적 분할을 위한 벤치마크 데이터세트로 활용된다.

데이터세트는 세그멘테이션 모델 학습을 위해 다음과 같은 정보를 제공한다.

어노테이션 정보 XML 파일(Annotations)
특정 모델 학습 작업을 위한 파일 정보가 담긴 TXT 파일(ImageSets)
원본 이미지 파일(JPEGImages)
의미론적 분할 이미지 파일(SegmentationClass)
인스턴스 세그멘테이션 이미지 파일(SegmentationObject)

예제에서는 세그멘테이션 모델을 구성할 예정이므로 ImageSets 폴더의 Segmentation에 정리된 train.txt와 val.txt를 활용해 이미지를 불러온다.

trainval.txt = train.txt + val.txt

FCN 모델 학습을 위한 원본 이미지와 마스크 이미지는 각각 JPEGImages, SegmentationClass에 담겨있다.(동일한 파일명으로 구성됨)

의미론적 분할 이미지 파일은 객체의 클래스를 구분하기 위해 동일한 클래스는 동일 색상으로 구분한다. 모든 인스턴스를 구분해야 한다면 SegmentationObject를 사용해야 한다.

이미지에서 레이블링된 픽셀들은 서로 다른 색상으로 표시되며, 각 클래스에 대한 색상 코드는 파스칼 데이터세트에서 확인할 수 있다.

이 책에서는 편의를 위해 색상 코드와 클래스가 정리된 classes.json 파일을 제공한다.

각 물체의 경계선에는 4px 정도의 오프셋(경계 영역)이 적용된다. 이 영역은 두 클래스 간의 경계를 정확하게 나타내기 위해 추가된 것으로, 마스크 이미지에서는 255 값으로 저장되며 아래 코드에서는 이 값을 배경(0)으로 치환해 사용한다.

import os
import json
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset

class SegmentationDataset(Dataset):
    """Pascal VOC 2012 semantic-segmentation dataset held fully in memory.

    The split listing is read from
    ``<root>/VOCdevkit/VOC2012/ImageSets/Segmentation/{train,val}.txt`` and
    every listed image/mask pair is loaded eagerly in ``__init__``, so both
    construction time and memory use grow with the size of the split.

    Args:
        root: Directory that contains the ``VOCdevkit`` folder.
        train: Load the ``train`` split when truthy, otherwise ``val``.
        transform: Optional callable applied to the RGB PIL image on access.
        target_transform: Optional callable applied to the mask tensor on
            access.

    Attributes:
        root: Path of the ``VOC2012`` folder.
        categories: Parsed content of ``classes.json`` found in ``root``.
        files: Sample identifiers read from the split listing.
        data: List of ``[image, target]`` pairs (PIL image, long tensor).
    """

    def __init__(self, root, train, transform=None, target_transform=None):
        super().__init__()
        self.root = os.path.join(root, "VOCdevkit", "VOC2012")
        file_type = "train" if train else "val"
        file_path = os.path.join(
            self.root, "ImageSets", "Segmentation", f"{file_type}.txt"
        )
        with open(os.path.join(self.root, "classes.json"), "r") as file:
            self.categories = json.load(file)
        # Read the listing inside a context manager so the handle is closed
        # deterministically, and skip blank lines so a trailing newline does
        # not turn into an empty file name.
        with open(file_path) as file:
            self.files = [line.strip() for line in file if line.strip()]
        self.transform = transform
        self.target_transform = target_transform
        self.data = self._load_data()

    def _load_data(self):
        """Load every listed image and mask into a list of pairs.

        Returns:
            A list of ``[image, target]`` where ``image`` is an RGB PIL image
            and ``target`` is a long tensor with a leading dimension of size
            one added in front of the mask array.
        """
        data = []
        for file in self.files:
            image_path = os.path.join(self.root, "JPEGImages", f"{file}.jpg")
            mask_path = os.path.join(self.root, "SegmentationClass", f"{file}.png")
            # ``convert`` / ``np.array`` materialise the pixels, so the file
            # handles can be released as soon as each ``with`` block ends.
            with Image.open(image_path) as source:
                image = source.convert("RGB")
            with Image.open(mask_path) as source:
                mask = np.array(source)
            # 255 is presumably the VOC boundary/void label (confirm against
            # the dataset docs); it is folded into class 0 here.
            mask = np.where(mask == 255, 0, mask)
            target = torch.LongTensor(mask).unsqueeze(0)
            data.append([image, target])
        return data

    def __getitem__(self, index):
        """Return the ``(image, mask)`` pair at ``index`` after transforms."""
        image, mask = self.data[index]
        if self.transform is not None:
            image = self.transform(image)
        if self.target_transform is not None:
            mask = self.target_transform(mask)
        return image, mask

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)
from torchvision import transforms
from torch.utils.data import DataLoader

# Image pipeline: PIL image -> tensor -> float dtype -> fixed 224x224 size,
# so that samples of different resolutions can be stacked into one batch.
transform = transforms.Compose([
    transforms.PILToTensor(),
    transforms.ConvertImageDtype(dtype=torch.float),
    transforms.Resize(size=(224, 224)),
])

# Masks are resized with nearest-neighbour interpolation so that the stored
# class indices are copied rather than blended into new values.
target_transform = transforms.Compose([
    transforms.Resize(
        size=(224, 224), interpolation=transforms.InterpolationMode.NEAREST
    ),
])

# Both splits share the same dataset root and the same pair of transforms.
dataset_options = {"transform": transform, "target_transform": target_transform}
train_dataset = SegmentationDataset("../datasets", train=True, **dataset_options)
test_dataset = SegmentationDataset("../datasets", train=False, **dataset_options)

# Incomplete trailing batches are dropped for both loaders.
train_dataloader = DataLoader(
    train_dataset, batch_size=8, shuffle=True, drop_last=True
)
test_dataloader = DataLoader(
    test_dataset, batch_size=4, shuffle=True, drop_last=True
)
import matplotlib.pyplot as plt

def draw_mask(images, masks, outputs=None, plot_size=4):
    """Plot images next to their colour-coded masks and optional predictions.

    One row is drawn per sample: the image, the ground-truth mask and, when
    ``outputs`` is given, the predicted mask. Colours and class names are
    looked up in the module-level ``train_dataset.categories`` mapping, keyed
    by the class index as a string.

    Args:
        images: Indexable batch of tensors; each item is converted with
            ``.numpy()`` and transposed from channel-first to channel-last.
        masks: Indexable batch of class-index tensors matching ``images``.
        outputs: Optional batch of predicted class-index tensors.
        plot_size: Number of rows (samples) to draw; must not exceed the
            number of items in the batch.
    """
    def color_mask(image, target):
        # Paint classes 1..20 with their palette colour; class 0 keeps the
        # zero (black) initial value.
        m = target.squeeze().numpy().astype(np.uint8)
        cm = np.zeros_like(image, dtype=np.uint8)

        for i in range(1, 21):
            cm[m == i] = train_dataset.categories[str(i)]["color"]

        # Names of every class index present in this mask, used as the title.
        classes = [train_dataset.categories[str(idx)]["class"] for idx in np.unique(m)]
        return cm, classes

    col = 3 if outputs is not None else 2
    figsize = 20 if outputs is not None else 28
    # squeeze=False keeps ``ax`` two-dimensional even for a single row, so
    # the ``ax[batch][column]`` indexing below also works for plot_size == 1.
    fig, ax = plt.subplots(
        plot_size,
        col,
        figsize=(14, figsize),
        constrained_layout=True,
        squeeze=False,
    )

    for batch in range(plot_size):
        im = images[batch].numpy().transpose(1, 2, 0)
        ax[batch][0].imshow(im)
        ax[batch][0].axis("off")

        cm, classes = color_mask(im, masks[batch])
        ax[batch][1].set_title(classes)
        ax[batch][1].imshow(cm)
        ax[batch][1].axis("off")

        if outputs is not None:
            cm, classes = color_mask(im, outputs[batch])
            ax[batch][2].set_title(classes)
            ax[batch][2].imshow(cm)
            ax[batch][2].axis("off")

# Pull a single batch from the training loader and preview its first four
# image/mask pairs.
first_batch = next(iter(train_dataloader))
images, masks = first_batch
draw_mask(images, masks, plot_size=4)
from torch import nn
from torch import optim
from torchvision.models import segmentation

# Number of output classes the model predicts per pixel.
num_classes = 21
device = "cuda" if torch.cuda.is_available() else "cpu"
# The keyword that selects the pretrained checkpoint is ``weights``; the
# previous spelling ``weight`` is not that parameter, so the named
# segmentation checkpoint was never requested for fine-tuning.
model = segmentation.fcn_resnet50(
    weights="FCN_ResNet50_Weights.COCO_WITH_VOC_LABELS_V1",
    num_classes=num_classes,
).to(device)

# Optimise only the parameters that are still trainable.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
criterion = nn.CrossEntropyLoss()
# Fine-tune for 30 epochs and report the mean per-batch loss of each epoch.
for epoch in range(30):
    model.train()
    cost = 0.0

    for images, targets in train_dataloader:
        images = images.to(device)
        targets = targets.to(device)

        output = model(images)
        # Flatten the "out" head to (pixels, num_classes) and the targets to
        # (pixels,) so that every pixel is scored as one classification.
        output = output["out"].permute(0, 2, 3, 1).contiguous().view(-1, num_classes)
        targets = targets.permute(0, 2, 3, 1).contiguous().view(-1)

        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float: adding the loss tensor itself
        # would keep a graph-attached tensor alive across iterations.
        cost += loss.item()
    cost = cost / len(train_dataloader)
    print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
# Qualitative check: predict one test batch and plot image / target /
# prediction side by side.
model.eval()
with torch.no_grad():
    images, masks = next(iter(test_dataloader))
    logits = model(images.to(device))["out"]
    # The class dimension is dim 1; argmax turns scores into class indices.
    outputs = logits.argmax(dim=1).cpu()
    draw_mask(images, masks, outputs, 4)
from collections import defaultdict

def calculate_iou(targets, outputs, ious, class_count, num_classes=21):
    """Add one batch's per-class IoU to the running accumulators.

    For every class index below ``num_classes`` the intersection-over-union
    between ``targets`` and ``outputs`` is computed. Classes that occur in
    neither array (empty union) are skipped and leave both accumulators
    untouched.

    Args:
        targets: Array of ground-truth class indices.
        outputs: Array of predicted class indices, comparable to ``targets``.
        ious: Indexable accumulator of summed IoU per class (updated in place).
        class_count: Mapping from class index to the number of IoU terms
            added for it (updated in place).
        num_classes: Number of class indices to evaluate, starting at 0.

    Returns:
        The same ``ious`` and ``class_count`` objects that were passed in.
    """
    # Positions where the prediction agrees with the ground truth.
    agreement = outputs == targets
    for cls in range(num_classes):
        in_target = targets == cls
        in_output = outputs == cls
        intersection = np.float32(np.sum(agreement & in_target))
        union = np.sum(in_target) + np.sum(in_output) - intersection
        if not union > 0:
            continue
        ious[cls] += intersection / union
        class_count[cls] += 1
    return ious, class_count

# Running per-class IoU sums and the number of batches that contributed to
# each class, filled in over the whole test loader.
ious = np.zeros(21)
class_count = defaultdict(int)
model.eval()
with torch.no_grad():
    for images, targets in test_dataloader:
        images = images.to(device)
        logits = model(images)["out"]
        # Move the class dimension last, then pick the best class per pixel.
        scores = logits.permute(0, 2, 3, 1).detach().to("cpu").numpy()
        outputs = scores.argmax(-1)
        targets = targets.permute(0, 2, 3, 1).squeeze().detach().to("cpu").numpy()

        ious, class_count = calculate_iou(targets, outputs, ious, class_count, 21)

# Mean IoU over the foreground classes 1..20 (background class 0 excluded).
# Only classes that contributed at least one IoU term are averaged: dividing
# by a zero count would turn the whole result into NaN, and dividing by a
# fixed 20 would understate the mean when a class never occurred.
per_class_iou = [
    ious[idx] / class_count[idx]
    for idx in range(1, 21)
    if class_count.get(idx, 0) > 0
]
# With no evaluated class at all the mean is undefined, reported as NaN.
miou = sum(per_class_iou) / len(per_class_iou) if per_class_iou else float("nan")
print(f"mIoU 계산 결과 : {miou}")