In [1]:
import pandas as pd
import numpy as np
import copy

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from PIL import Image
from tqdm import tqdm

import mlflow
import mlflow.pytorch

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import lr_scheduler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

import warnings

sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 15})

%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

warnings.filterwarnings('ignore')

In [2]:
class goodsDataset(Dataset):
    def __init__(self, df, classes_list, transform=None):
        """
        Arguments:
            df : pandas DataFrame.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_frame = df
        self.transform = transform
        self.classes_list = classes_list

    def encode_label(self, label):
        classes_list = list(self.classes_list)
        target = torch.zeros(len(classes_list))
        for l in label:
          idx = classes_list.index(l)
          target[idx] = 1
        return target

    def __len__(self):
        return self.data_frame.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        try:
            path = self.data_frame.iloc[idx, 0]
            image = Image.open(path).convert("RGB")

            if self.transform:
                image = self.transform(image)

            # label = torch.tensor()
            label = self.encode_label(self.data_frame.iloc[idx, 1])

            sample = [image, label]
            return sample

        except Exception as e:
            print(f"произошла ошибка в goodsDataset при загрузки картинки: {e}")
            return


def load_data(df,
              transform=None,
              batch_size=4,
              num_workers=0,
              classes_list=None,
              shuffle=True):

    goods_dataset = goodsDataset(df=df, 
                                transform=transform,
                                classes_list=classes_list)
    dataloader = DataLoader(
        goods_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )
    return dataloader


def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs, name_file_save, device='cpu', scheduler=None, save=True):
    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []
    best_accuracy = 0.0
    best_weights = copy.deepcopy(model.state_dict())

    for epoch in range(1, num_epochs + 1):
        train_loss, train_accuracy = 0.0, 0.0
        model.train()
        for images, labels in tqdm(train_loader, desc='Training'):
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(images)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.shape[0]
            train_accuracy += ((logits > 0.5) == labels).sum().item() / labels.numel()

        if scheduler is not None:
            scheduler.step()

        train_loss /= len(train_loader.dataset)
        train_accuracy /= len(train_loader)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        if save:
            mlflow.pytorch.log_model(model, name_file_save)

        true_label = np.array([])
        predict_label = np.array([])
        test_loss, test_accuracy = 0.0, 0.0
        model.eval()
        for images, labels in tqdm(valid_loader, desc='Validating'):
            true_label = np.append(true_label, labels.cpu().numpy())
            images = images.to(device)
            labels = labels.to(device)

            with torch.no_grad():
                logits = model(images)
                loss = criterion(logits, labels)
                probabilities = torch.sigmoid(logits)
                predicted_class = probabilities > 0.5
                predict_label = np.append(predict_label, predicted_class.cpu().numpy())

            test_loss += loss.item() * images.shape[0]
            test_accuracy += ((logits > 0.5) == labels).sum().item() / labels.numel()

        test_loss /= len(valid_loader.dataset)
        test_accuracy /= len(valid_loader)
        valid_losses.append(test_loss)
        valid_accuracies.append(test_accuracy)

        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            best_weights = copy.deepcopy(model.state_dict())

        model.load_state_dict(best_weights)
        if save:
            mlflow.pytorch.log_model(model, name_file_save)

        mlflow.log_metrics({
            "train_losses": train_losses[-1],
            "valid_losses": valid_losses[-1],
            "train_accuracies": train_accuracies[-1],
            "valid_accuracies": valid_accuracies[-1],
            "f1_score_macro": f1_score(true_label, predict_label, average='macro'),
            "f1_score_micro": f1_score(true_label, predict_label, average='micro'),
            "f1_score_weighted": f1_score(true_label, predict_label, average='weighted')
        }, step=epoch)

    return train_losses, train_accuracies, valid_losses, valid_accuracies

# densenet121, IMAGENET1K_V1

In [3]:
from torchvision.models import DenseNet121_Weights, densenet121
weights = DenseNet121_Weights.IMAGENET1K_V1
model = densenet121(weights=weights)
transform = weights.transforms()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()

In [4]:
df = pd.read_csv("plantdoc_annotation.csv", sep=';').drop('Unnamed: 2', axis=1)
df['id'] = df['id'].apply(lambda x: 'plantsdoc/' + x)
df['classes'] = df['classes'].apply(lambda x: eval(x))

In [5]:
df

Unnamed: 0,id,classes
0,plantsdoc/ga-2015-05-20-nclb.jpg,"[Corn, leaf blight]"
1,plantsdoc/corn-disease-update-fig-1-northern-l...,"[Corn, leaf blight]"
2,plantsdoc/corn-BLS-irregular-lesions.jpg,"[Corn, leaf blight]"
3,plantsdoc/186116-325x209-Northern-Corn-Leaf-Bl...,"[Corn, leaf blight]"
4,plantsdoc/nclb-2015-n-mcghee.jpg,"[Corn, leaf blight]"
...,...,...
2571,plantsdoc/1234080-Early-Blight.jpg,"[Tomato, Early blight leaf]"
2572,plantsdoc/rsz0803Figure6.jpg,"[Corn, Gray leaf spot]"
2573,plantsdoc/show_picture.asp_id=aaaaaaaaaaogcqq&...,"[Corn, Gray leaf spot]"
2574,plantsdoc/IMG_42231.jpg,"[Corn, Gray leaf spot]"


In [6]:
all_classes = df['classes'].apply(pd.Series).stack().tolist()
all_classes = set(all_classes)
all_classes = np.array(list(all_classes))

In [7]:
all_classes.size

28

In [8]:
all_classes

array(['Bell_pepper', 'Strawberry', 'Scab Leaf', 'leaf blight',
       'mold leaf', 'leaf early blight', 'leaf late blight', 'Cherry',
       'Gray leaf spot', 'grape', 'rust leaf', 'leaf mosaic virus',
       'Early blight leaf', 'Corn', 'leaf spot', 'leaf yellow virus',
       'Raspberry', 'Tomato', 'Blueberry', 'Septoria leaf spot',
       'leaf bacterial spot', 'Potato', 'Apple', 'Soyabean', 'Squash',
       'Peach', 'Powdery mildew leaf', 'leaf black rot'], dtype='<U19')

In [9]:
train, valid = train_test_split(df, test_size=0.2, random_state=42)

train_loader = load_data(
    train,
    transform,
    batch_size=50,
    classes_list=all_classes
)

valid_loader = load_data(
    valid,
    transform,
    batch_size=50,
    classes_list=all_classes,
    shuffle=False)

# здесь classifier может меняться, зависит от предуобченной модели
model.classifier = nn.Sequential(
    # здесь число первое может меняться, зависит от предуобченной модели
    nn.Linear(1024, 100),
    nn.ReLU(),
    nn.Linear(100, all_classes.shape[0])
)

# Замораживаем все слои
for param in model.parameters():
    param.requires_grad = False

# Размораживаем параметры последнего полносвязанного слоя (classifier)
for param in model.classifier.parameters():
    param.requires_grad = True

model = model.to(device)

In [10]:
# criterion = nn.CrossEntropyLoss().to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
num_epochs = 10

optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=0.001, weight_decay=0.01)

scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0, verbose=True)
experiment_name = "Plant_1"
mlflow.set_experiment(experiment_name)
mlflow.enable_system_metrics_logging()

name_model = "densenet121_10_PD_f1"
with mlflow.start_run(run_name=name_model) as run:
    train_losses, train_accuracies, valid_accuracies, valid_accuracies = train_model(
        model,
        train_loader,
        valid_loader,
        criterion,
        optimizer,
        num_epochs,
        name_model,
        device=device,
        scheduler=scheduler,
        save=True
    )

2024/06/07 18:42:42 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:41<00:00,  1.02it/s]
Validating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:10<00:00,  1.02it/s]
Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:40<00:00,  1.05it/s]
Validating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:10<00:00,  1.03it/s]
Training: 100%|███████████████████

In [11]:
goods_dataset = goodsDataset(df=df, 
                                transform=transform,
                                classes_list=all_classes)

In [12]:
df["id"][1]

'plantsdoc/corn-disease-update-fig-1-northern-leaf-blight.jpg'

In [13]:
goods_dataset[1][1]

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [14]:
df.iloc[1, 0]

'plantsdoc/corn-disease-update-fig-1-northern-leaf-blight.jpg'

In [None]:
goods_dataset.encode_label(self, label)