In [1]:
import pandas as pd
import numpy as np
import copy

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from PIL import Image
from tqdm import tqdm

import mlflow
import mlflow.pytorch

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import lr_scheduler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

import warnings

sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 15})

%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

warnings.filterwarnings('ignore')

In [2]:
class goodsDataset(Dataset):
    def __init__(self, df, classes_list, transform=None):
        """
        Arguments:
            df : pandas DataFrame.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_frame = df
        self.transform = transform
        self.classes_list = classes_list

    def encode_label(self, label):
        classes_list = list(self.classes_list)
        target = torch.zeros(len(classes_list))
        for l in label:
          idx = classes_list.index(l)
          target[idx] = 1
        return target

    def __len__(self):
        return self.data_frame.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        try:
            path = self.data_frame.iloc[idx, 0]
            image = Image.open(path).convert("RGB")

            if self.transform:
                image = self.transform(image)

            # label = torch.tensor()
            label = self.encode_label(self.data_frame.iloc[idx, 1])

            sample = [image, label]
            return sample

        except Exception as e:
            print(f"произошла ошибка в goodsDataset при загрузки картинки: {e}")
            return


def load_data(df,
              transform=None,
              batch_size=4,
              num_workers=0,
              classes_list=None,
              shuffle=True):

    goods_dataset = goodsDataset(df=df, 
                                transform=transform,
                                classes_list=classes_list)
    dataloader = DataLoader(
        goods_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )
    return dataloader


def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs, name_file_save, device='cpu', scheduler=None, save=True):
    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []
    best_accuracy = 0.0
    best_weights = copy.deepcopy(model.state_dict())

    for epoch in range(1, num_epochs + 1):
        train_loss, train_accuracy = 0.0, 0.0
        model.train()
        for images, labels in tqdm(train_loader, desc='Training'):
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(images)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.shape[0]
            train_accuracy += ((logits > 0.5) == labels).sum().item() / labels.numel()

        if scheduler is not None:
            scheduler.step()

        train_loss /= len(train_loader.dataset)
        train_accuracy /= len(train_loader)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        if save:
            mlflow.pytorch.log_model(model, name_file_save)

        true_label = np.array([])
        predict_label = np.array([])
        test_loss, test_accuracy = 0.0, 0.0
        model.eval()
        for images, labels in tqdm(valid_loader, desc='Validating'):
            true_label = np.append(true_label, labels.cpu().numpy())
            images = images.to(device)
            labels = labels.to(device)

            with torch.no_grad():
                logits = model(images)
                loss = criterion(logits, labels)
                probabilities = torch.sigmoid(logits)
                predicted_class = probabilities > 0.5
                predict_label = np.append(predict_label, predicted_class.cpu().numpy())

            test_loss += loss.item() * images.shape[0]
            test_accuracy += ((logits > 0.5) == labels).sum().item() / labels.numel()

        test_loss /= len(valid_loader.dataset)
        test_accuracy /= len(valid_loader)
        valid_losses.append(test_loss)
        valid_accuracies.append(test_accuracy)

        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            best_weights = copy.deepcopy(model.state_dict())

        model.load_state_dict(best_weights)
        if save:
            mlflow.pytorch.log_model(model, name_file_save)

        mlflow.log_metrics({
            "train_losses": train_losses[-1],
            "valid_losses": valid_losses[-1],
            "train_accuracies": train_accuracies[-1],
            "valid_accuracies": valid_accuracies[-1],
            "f1_score_macro": f1_score(true_label, predict_label, average='macro'),
            "f1_score_micro": f1_score(true_label, predict_label, average='micro'),
            "f1_score_weighted": f1_score(true_label, predict_label, average='weighted')
        }, step=epoch)

    return train_losses, train_accuracies, valid_losses, valid_accuracies

# efficientnet_b0, IMAGENET1K_V1

In [3]:
from torchvision.models import EfficientNet_B0_Weights, efficientnet_b0
weights = EfficientNet_B0_Weights.IMAGENET1K_V1
model = efficientnet_b0(weights=weights)
transform = weights.transforms()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()

In [4]:
df = pd.read_csv("plantdoc_annotation.csv", sep=';').drop('Unnamed: 2', axis=1)
df['id'] = df['id'].apply(lambda x: 'plantsdoc/' + x)
df['classes'] = df['classes'].apply(lambda x: eval(x))

In [5]:
df

Unnamed: 0,id,classes
0,plantsdoc/B9230061-Powdery_Mildew_on_Squash_Le...,"[Squash, Powdery mildew]"
1,plantsdoc/16488048256_f64cfea40e_z.jpg,"[Squash, Powdery mildew]"
2,plantsdoc/87985926-1-56a349f35f9b58b7d0d14d82.jpg,"[Squash, Powdery mildew]"
3,plantsdoc/53cf2cffb466d325a61897ef320cfccf.jpg,"[Squash, Powdery mildew]"
4,plantsdoc/flora-ivy-plant-food-produce-vegetab...,"[Squash, Powdery mildew]"
...,...,...
2564,plantsdoc/grape_F15a.jpg,"[Grape, Black rot]"
2565,plantsdoc/Black%20rot%20on%20foliage.jpg,"[Grape, Black rot]"
2566,plantsdoc/5-29black-rot-chardRR.jpg,"[Grape, Black rot]"
2567,plantsdoc/Black%20rot%20on%20foliage2.jpg,"[Grape, Black rot]"


In [6]:
all_classes = df['classes'].apply(pd.Series).stack().tolist()
all_classes = set(all_classes)
all_classes = np.array(list(all_classes))

In [7]:
all_classes.size

27

In [8]:
all_classes

array(['Black rot', 'Mosaic virus', 'Yellow leaf virus', 'Potato',
       'Septoria leaf spot', 'Bell_pepper', 'Tomato', 'Late blight',
       'Squash', 'Raspberry', 'Grape', 'mold leaf', 'leaf blight', 'Scab',
       'Early blight', 'Corn', 'Blueberry', 'Healthy', 'Strawberry',
       'Bacterial spot', 'Rust', 'Soyabean', 'Cherry', 'Powdery mildew',
       'Peach', 'Apple', 'Gray leaf spot'], dtype='<U18')

In [9]:
train, valid = train_test_split(df, test_size=0.4, random_state=42)

valid, test = train_test_split(df, test_size=0.5, random_state=42)

train_loader = load_data(
    train,
    transform,
    batch_size=50,
    classes_list=all_classes
)

valid_loader = load_data(
    valid,
    transform,
    batch_size=50,
    classes_list=all_classes,
    shuffle=False)

test_loader = load_data(
    test,
    transform,
    batch_size=50,
    classes_list=all_classes,
    shuffle=False)

# здесь classifier может меняться, зависит от предуобченной модели
model.classifier = nn.Sequential(
    # здесь число первое может меняться, зависит от предуобченной модели
    nn.Linear(1280, 100),
    nn.ReLU(),
    nn.Linear(100, all_classes.shape[0])
)

# Замораживаем все слои
for param in model.parameters():
    param.requires_grad = False

# Размораживаем параметры последнего полносвязанного слоя (classifier)
for param in model.classifier.parameters():
    param.requires_grad = True

model = model.to(device)

In [10]:
# criterion = nn.CrossEntropyLoss().to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
num_epochs = 10

optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=0.001, weight_decay=0.01)

scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0, verbose=True)
experiment_name = "Plant_1"
mlflow.set_experiment(experiment_name)
mlflow.enable_system_metrics_logging()

name_model = "efficientnet_b0_10_PD_test_f1"
with mlflow.start_run(run_name=name_model) as run:
    train_losses, train_accuracies, valid_accuracies, valid_accuracies = train_model(
        model,
        train_loader,
        valid_loader,
        criterion,
        optimizer,
        num_epochs,
        name_model,
        device=device,
        scheduler=scheduler,
        save=True
    )

2025/03/13 01:23:26 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
Training: 100%|█████████████████████████████████| 31/31 [00:36<00:00,  1.16s/it]
Validating: 100%|███████████████████████████████| 26/26 [00:29<00:00,  1.14s/it]
Training: 100%|█████████████████████████████████| 31/31 [00:35<00:00,  1.14s/it]
Validating: 100%|███████████████████████████████| 26/26 [00:29<00:00,  1.14s/it]
Training: 100%|█████████████████████████████████| 31/31 [00:35<00:00,  1.14s/it]
Validating: 100%|███████████████████████████████| 26/26 [00:29<00:00,  1.14s/it]
Training: 100%|█████████████████████████████████| 31/31 [00:35<00:00,  1.15s/it]
Validating: 100%|███████████████████████████████| 26/26 [00:29<00:00,  1.15s/it]
Training: 100%|█████████████████████████████████| 31/31 [00:35<00:00,  1.14s/it]
Validating: 100%|███████████████████████████████| 26/26 [00:29<00:00,  1.14s/it]
Training: 100%|█████████████████████████████████| 31/31 [00:35<00:00,  1.14s/it]
Val

In [91]:
goods_dataset = goodsDataset(df=df, 
                                transform=transform,
                                classes_list=all_classes)

In [92]:
df["id"][1]

'plantsdoc/16488048256_f64cfea40e_z.jpg'

In [93]:
goods_dataset[1][1]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0.])

In [94]:
df.iloc[1, 0]

'plantsdoc/16488048256_f64cfea40e_z.jpg'

In [95]:
#goods_dataset.encode_label(self, label)

In [96]:
#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

image = Image.open('plantsdoc/16488048256_f64cfea40e_z.jpg').convert("RGB")  # Открываем изображение и конвертируем в RGB
image = transform(image).unsqueeze(0)

image = image.to(device) 
model = model.to(device)
#image = transform(image)

In [97]:
true_labels=goods_dataset[1][1].squeeze().cpu().numpy()

In [98]:
class_names = ['Apple', 'Bell_pepper', 'Powdery mildew', 'Early blight', 'Corn',
'Septoria leaf spot', 'Cherry', 'Healthy', 'Blueberry',
'Yellow leaf virus', 'Raspberry', 'Rust', 'Potato', 'leaf blight',
'Mosaic virus', 'Late blight', 'Scab', 'Peach', 'Grape',
'mold leaf', 'Gray leaf spot', 'Strawberry', 'Soyabean', 'Tomato',
'Black rot', 'Bacterial spot', 'Squash']


In [99]:
class_names = all_classes

In [100]:
f = open('plant_list.txt')
plants=[]
for line in f:
    plants.append(line.rstrip())

print(plants)
print(len(plants))

['Apple', 'Blueberry', 'Cherry', 'Corn', 'Grape', 'Peach', 'Bell_pepper', 'Potato', 'Raspberry', 'Soyabean', 'Squash', 'Strawberry', 'Tomato']
13


In [101]:
f = open('disease.txt')
diseases=[]
for line in f:
    diseases.append(line.rstrip())

print(diseases,len(diseases))

['Healthy', 'leaf blight', 'mold leaf', 'Rust', 'Septoria leaf spot', 'Late blight', 'Bacterial spot', 'Mosaic virus', 'Yellow leaf virus', 'Powdery mildew', 'Black rot', 'Scab', 'Early blight', 'Gray leaf spot'] 14


In [102]:
class_plants=list(set(class_names) & set(plants))

print(class_plants,len(class_plants))

['Corn', 'Soyabean', 'Strawberry', 'Cherry', 'Blueberry', 'Squash', 'Raspberry', 'Peach', 'Potato', 'Grape', 'Apple', 'Bell_pepper', 'Tomato'] 13


In [103]:
model.eval()  # Переключение модели в режим оценки

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [104]:
with torch.no_grad():
    outputs = model(image)  # Получаем логиты
    probabilities = torch.sigmoid(outputs)  # Применяем sigmoid для получения вероятностей
    probs = probabilities.squeeze().cpu().numpy().tolist()
    print(probabilities)
    print(probs)
    
    # Ищем класс с максимальной вероятностью для растений и болезней
    plant_indices = [i for i, name in enumerate(class_names) if name in plants]
    disease_indices = [i for i, name in enumerate(class_names) if name in diseases]

    #y_plant_index = plant_indices[np.argmax([probs[i] for i in plant_indices])]
    y_plant_index = probs.index(max(probs[i] for i in plant_indices))
    print(probs[y_plant_index],y_plant_index,class_names[y_plant_index])
    y_disease_index = probs.index(max(probs[i] for i in disease_indices))
    print(probs[y_disease_index],y_disease_index,class_names[y_disease_index])
    
    y_plant = probs[y_plant_index]
    y_plant_name = class_names[y_plant_index]
    y_disease = probs[y_disease_index]
    y_disease_name = class_names[y_disease_index]
    
    # Создаем бинарные метки
    preds = [1 if class_names[i] in (y_plant_name, y_disease_name) else 0 for i in range(len(class_names))]
    #print(preds,len(preds))
    pred_labels = np.array(preds, dtype=np.int32)
    #pred_labels = np.array(preds, dtype=int)
    #preds = (probabilities > 0.3).int()

# Вывод предсказаний
#print(true_labels.shape)  # Должно быть (N,)
#print(pred_labels.shape)  # Должно быть (N,)
print(f"Предсказанные метки: {pred_labels}")
print(f"Истинные метки: {true_labels}")
print(type(true_labels))
print(type(pred_labels))


#pred_labels=pred_labels.cpu().numpy()


# Вычисляем classification_report
report = classification_report(true_labels.reshape(1, -1), pred_labels.reshape(1, -1), target_names=class_names, zero_division=0)
print(report)


tensor([[0.2390, 0.0065, 0.0068, 0.0188, 0.0308, 0.0110, 0.0393, 0.0149, 0.8511,
         0.0544, 0.5316, 0.0301, 0.0121, 0.0568, 0.0247, 0.0030, 0.0084, 0.2296,
         0.0120, 0.0275, 0.0522, 0.0099, 0.0126, 0.8358, 0.0103, 0.1169, 0.0077]],
       device='cuda:0')
[0.23897939920425415, 0.006475712638348341, 0.00676361657679081, 0.01875654235482216, 0.030769137665629387, 0.010952968150377274, 0.03931235522031784, 0.014929354190826416, 0.8510856032371521, 0.05440158769488335, 0.5316214561462402, 0.030128121376037598, 0.012142627499997616, 0.056843627244234085, 0.024680299684405327, 0.0030464024748653173, 0.008350137621164322, 0.2296139895915985, 0.012044651433825493, 0.027455179020762444, 0.05219947174191475, 0.0098974434658885, 0.01255970261991024, 0.835801362991333, 0.010342643596231937, 0.11691523343324661, 0.007706086151301861]
0.8510856032371521 8 Squash
0.835801362991333 23 Powdery mildew
Предсказанные метки: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
Истинные метк

In [116]:
def get_predictions(model, test_loader, device):
    model.eval()  # Переключение модели в режим оценки
    all_preds = []  # Для хранения предсказанных меток
    all_probs = []  # Для хранения вероятностей
    all_labels = []  # Для хранения истинных меток
    plant_indices = [i for i, name in enumerate(class_names) if name in plants]
    disease_indices = [i for i, name in enumerate(class_names) if name in diseases]

    with torch.no_grad():  # Отключаем вычисление градиентов
        for inputs, labels in test_loader:
            inputs = inputs.to(device)  # Перемещаем данные на устройство
            labels = labels.to(device)
            #print(inputs, labels)
            # Получаем выход модели (логиты)
            outputs = model(inputs)

            # Применяем sigmoid для получения вероятностей (для multilabel классификации)
            probabilities = torch.sigmoid(outputs)
            #print(probabilities)

            probs = probabilities.squeeze().cpu().numpy().tolist()
            
            for prob in probs:
                print(len(prob))
                y_plant_index = prob.index(max(prob[i] for i in plant_indices))
                #print(probs[y_plant_index],y_plant_index,class_names[y_plant_index])
                y_disease_index = prob.index(max(prob[i] for i in disease_indices))
                #print(probs[y_disease_index],y_disease_index,class_names[y_disease_index])
                
                y_plant = prob[y_plant_index]
                y_plant_name = class_names[y_plant_index]
                y_disease = prob[y_disease_index]
                y_disease_name = class_names[y_disease_index]
                
                # Создаем бинарные метки
                preds = [1 if class_names[i] in (y_plant_name, y_disease_name) else 0 for i in range(len(class_names))]
                #print(preds,len(preds))
                pred_labels = np.array(preds, dtype=np.int32)

                # Сохраняем результаты
                all_preds.append(pred_labels)  # Перемещаем на CPU и преобразуем в NumPy
                
            all_probs.extend(probabilities.cpu().numpy())  # Сохраняем вероятности
            all_labels.extend(labels.cpu().numpy())  # Сохраняем истинные метки

    return all_preds, all_probs, all_labels

In [117]:
preds, probs, true_labels = get_predictions(model, test_loader, device)

#print("Размер true_labels:", true_labels.shape)
#print("Размер pred_labels:", pred_labels.shape)

# Выводим предсказания и истинные метки
print("Предсказанные метки:", preds)
print("Истинные метки:", true_labels)

# Вычисляем classification_report (если есть истинные метки)
if true_labels:
    report = classification_report(true_labels, preds, target_names=class_names, zero_division=0)
    print(report)

27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
27
2

In [118]:
results_df = pd.DataFrame({
    "true_labels": [", ".join([class_names[i] for i, val in enumerate(labels) if val == 1]) for labels in true_labels],  # Истинные метки
    "pred_labels": [", ".join([class_names[i] for i, val in enumerate(labels) if val == 1]) for labels in preds],  # Предсказанные метки
})

classes_df = pd.DataFrame( probs, columns=all_classes)

results_df = pd.concat([results_df, classes_df], axis=1)

# Сохраняем DataFrame в CSV-файл
results_df.to_csv("true_vs_pred_labels_04.csv", index=False)

In [243]:
results_df=[]

for i, class_name in enumerate(class_names):
    results_df[f"prob_{class_name}"] = [probs[i] for probs in true_labels]

# Сохраняем обновленный DataFrame
results_df.to_csv("true_labels.csv", index=False)

TypeError: list indices must be integers or slices, not str

In [244]:
results_df=[]

for i, class_name in enumerate(class_names):
    for i in preds:
        results_df[f"prob_{class_name}"] = [probs[i] for probs in preds]
        results_df

# Сохраняем обновленный DataFrame
results_df.to_csv("pred_labels.csv", index=False)

TypeError: list indices must be integers or slices, not str