In [1]:
import pandas as pd
import numpy as np
import copy

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from PIL import Image
from tqdm import tqdm

import mlflow
import mlflow.pytorch

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import lr_scheduler

from sklearn.model_selection import train_test_split

import warnings

sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 15})

%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

warnings.filterwarnings('ignore')

In [2]:
class goodsDataset(Dataset):
    def __init__(self, df, classes_list, transform=None):
        """
        Arguments:
            df : pandas DataFrame.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_frame = df
        self.transform = transform
        self.classes_list = classes_list

    def encode_label(self, label):
        classes_list = list(self.classes_list)
        target = torch.zeros(len(classes_list))
        for l in label:
          idx = classes_list.index(l)
          target[idx] = 1
        return target

    def __len__(self):
        return self.data_frame.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        try:
            path = self.data_frame.iloc[idx, 0]
            image = Image.open(path).convert("RGB")

            if self.transform:
                image = self.transform(image)

            # label = torch.tensor()
            label = self.encode_label(self.data_frame.iloc[idx, 1])

            sample = [image, label]
            return sample

        except Exception as e:
            print(f"произошла ошибка в goodsDataset при загрузки картинки: {e}")
            return


def load_data(df,
              transform=None,
              batch_size=4,
              num_workers=0,
              classes_list=None,
              shuffle=True):

    goods_dataset = goodsDataset(df=df, 
                                transform=transform,
                                classes_list=classes_list)
    dataloader = DataLoader(
        goods_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )
    return dataloader


def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs, name_file_save, device='cpu', scheduler=None, save=True):
    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []
    best_accuracy = 0.0
    best_weights = copy.deepcopy(model.state_dict())

    for epoch in range(1, num_epochs + 1):
        train_loss, train_accuracy = 0.0, 0.0
        model.train()
        for images, labels in tqdm(train_loader, desc='Training'):
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(images)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.shape[0]
            train_accuracy += ((logits > 0.5) == labels).sum().item() / labels.numel()

        if scheduler is not None:
            scheduler.step()

        train_loss /= len(train_loader.dataset)
        train_accuracy /= len(train_loader)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        if save:
            mlflow.pytorch.log_model(model, name_file_save)

        true_label = np.array([])
        predict_label = np.array([])
        test_loss, test_accuracy = 0.0, 0.0
        model.eval()
        for images, labels in tqdm(valid_loader, desc='Validating'):
            true_label = np.append(true_label, labels.cpu().numpy())
            images = images.to(device)
            labels = labels.to(device)

            with torch.no_grad():
                logits = model(images)
                loss = criterion(logits, labels)
                probabilities = torch.sigmoid(logits)
                predicted_class = probabilities > 0.5
                predict_label = np.append(predict_label, predicted_class.cpu().numpy())

            test_loss += loss.item() * images.shape[0]
            test_accuracy += ((logits > 0.5) == labels).sum().item() / labels.numel()

        test_loss /= len(valid_loader.dataset)
        test_accuracy /= len(valid_loader)
        valid_losses.append(test_loss)
        valid_accuracies.append(test_accuracy)

        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            best_weights = copy.deepcopy(model.state_dict())

        model.load_state_dict(best_weights)
        if save:
            mlflow.pytorch.log_model(model, name_file_save)

        mlflow.log_metrics({
            "train_losses": train_losses[-1],
            "valid_losses": valid_losses[-1],
            "train_accuracies": train_accuracies[-1],
            "valid_accuracies": valid_accuracies[-1]
        }, step=epoch)

    return train_losses, train_accuracies, valid_losses, valid_accuracies

# densenet121, IMAGENET1K_V1

In [3]:
from torchvision.models import DenseNet121_Weights, densenet121
weights = DenseNet121_Weights.IMAGENET1K_V1
model = densenet121(weights=weights)
transform = weights.transforms()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()

In [4]:
df = pd.read_csv("plantvillage_annotation.csv", sep=';').drop('Unnamed: 2', axis=1)
df['id'] = df['id'].apply(lambda x: 'plants/' + x)
df['classes'] = df['classes'].apply(lambda x: eval(x))

In [5]:
df

Unnamed: 0,id,classes
0,plants/279b02e6-6c6e-43eb-abfa-171cd1bded41___...,[Grape___Leaf_blight_(Isariopsis_Leaf_Spot)]
1,plants/8dacdf2a-e031-4b21-b191-86e82078ef6d___...,[Grape___Leaf_blight_(Isariopsis_Leaf_Spot)]
2,plants/42032c9c-e401-452a-9b77-e19ee8cd7ed5___...,[Grape___Leaf_blight_(Isariopsis_Leaf_Spot)]
3,plants/54a17d93-22ce-4478-bb69-5d9320b34ad0___...,[Grape___Leaf_blight_(Isariopsis_Leaf_Spot)]
4,plants/ea7391a1-7052-4db6-bbd4-210fb75bd238___...,[Grape___Leaf_blight_(Isariopsis_Leaf_Spot)]
...,...,...
54300,plants/80b27199-5f5a-48e0-8dd0-f75cf23e1519___...,[Cherry_(including_sour)___Powdery_mildew]
54301,plants/eaa2a350-ce36-4b52-b259-d71d43ee2dda___...,[Cherry_(including_sour)___Powdery_mildew]
54302,plants/b696bdf0-4c75-40e6-a980-000f8d75c654___...,[Cherry_(including_sour)___Powdery_mildew]
54303,plants/945f0cfe-c6da-4164-92f6-900acad26611___...,[Cherry_(including_sour)___Powdery_mildew]


In [6]:
all_classes = df['classes'].apply(pd.Series).stack().tolist()
all_classes = set(all_classes)
all_classes = np.array(list(all_classes))

In [7]:
all_classes.size

38

In [8]:
all_classes

array(['Raspberry___healthy', 'Tomato___Late_blight', 'Peach___healthy',
       'Corn_(maize)___healthy', 'Potato___Early_blight',
       'Potato___healthy', 'Apple___healthy', 'Tomato___Bacterial_spot',
       'Tomato___Tomato_Yellow_Leaf_Curl_Virus',
       'Cherry_(including_sour)___healthy',
       'Corn_(maize)___Northern_Leaf_Blight',
       'Cherry_(including_sour)___Powdery_mildew', 'Grape___Black_rot',
       'Tomato___Target_Spot', 'Squash___Powdery_mildew',
       'Tomato___healthy', 'Pepper,_bell___healthy',
       'Blueberry___healthy',
       'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)',
       'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot',
       'Peach___Bacterial_spot', 'Corn_(maize)___Common_rust_',
       'Tomato___Septoria_leaf_spot', 'Strawberry___Leaf_scorch',
       'Tomato___Leaf_Mold', 'Tomato___Early_blight',
       'Tomato___Tomato_mosaic_virus', 'Apple___Apple_scab',
       'Orange___Haunglongbing_(Citrus_greening)', 'Soybean___healthy',
       'Apple

In [9]:
train, valid = train_test_split(df, test_size=0.2, random_state=42)

train_loader = load_data(
    train,
    transform,
    batch_size=50,
    classes_list=all_classes
)

valid_loader = load_data(
    valid,
    transform,
    batch_size=50,
    classes_list=all_classes,
    shuffle=False)

# здесь classifier может меняться, зависит от предуобченной модели
model.classifier = nn.Sequential(
    # здесь число первое может меняться, зависит от предуобченной модели
    nn.Linear(1024, 100),
    nn.ReLU(),
    nn.Linear(100, all_classes.shape[0])
)

# Замораживаем все слои
for param in model.parameters():
    param.requires_grad = False

# Размораживаем параметры последнего полносвязанного слоя (classifier)
for param in model.classifier.parameters():
    param.requires_grad = True

model = model.to(device)

In [10]:
# criterion = nn.CrossEntropyLoss().to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
num_epochs = 10

optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=0.001, weight_decay=0.01)

scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0, verbose=True)
experiment_name = "Plant_1"
mlflow.set_experiment(experiment_name)

name_model = "densenet121_10_mc"
with mlflow.start_run(run_name=name_model) as run:
    train_losses, train_accuracies, valid_accuracies, valid_accuracies = train_model(
        model,
        train_loader,
        valid_loader,
        criterion,
        optimizer,
        num_epochs,
        name_model,
        device=device,
        scheduler=scheduler,
        save=True
    )

Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 869/869 [03:22<00:00,  4.30it/s]
Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 218/218 [00:48<00:00,  4.52it/s]
Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 869/869 [03:19<00:00,  4.35it/s]
Validating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 218/218 [00:47<00:00,  4.59it/s]
Training: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [11]:
goods_dataset = goodsDataset(df=df, 
                                transform=transform,
                                classes_list=all_classes)

In [12]:
df["id"][1]

'plants/8dacdf2a-e031-4b21-b191-86e82078ef6d___FAM_L.Blight 1629.JPG'

In [13]:
goods_dataset[1][1]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [49]:
df.iloc[1, 0]

'plants/8dacdf2a-e031-4b21-b191-86e82078ef6d___FAM_L.Blight 1629.JPG'

In [None]:
goods_dataset.encode_label(self, label)