Challenge sur la Fraude¶
Chargements des bibliothèques et des données¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "6" # Définition d'un nombre de process pour éviter les warning
os.listdir() #listing des fichiers pour chargements
Out[2]:
['.ipynb_checkpoints', 'catboost_info', 'cat_fraud.xlsx', 'Challenge_Fraude - Copie.ipynb', 'Challenge_Fraude-Copy1.ipynb', 'Challenge_Fraude.ipynb', 'challengfe submission.csv', 'df_feature.csv', 'fraude.pdf', 'fraude.pptx', 'submission.csv', 'Untitled.ipynb', 'X_test_8skS2ey.csv', 'X_train_G3tdtEn.csv', 'Y_test_random_2.csv', 'Y_train_2_XPXJDyy.csv', '~$fraud.pptx', '~$fraude.pptx']
In [3]:
#Paramètre permettant de consulter l'ensemble des colonnes et adapter le contenu des colonnes sans tronquer
pd.set_option("display.max_colwidth",None)
pd.set_option('display.max_columns', None)
In [4]:
#Chargement des jeux d'entrainement X_train, X_test
X_train=pd.read_csv("X_train_G3tdtEn.csv", low_memory=False)#Ajustement mémoire
y_train=pd.read_csv("Y_train_2_XPXJDyy.csv", index_col=0)#Suppression colonnes lors de l'intégration
In [5]:
#Chargement du jeu de test
X_test=pd.read_csv("X_test_8skS2ey.csv", low_memory=False)
Analyse exploratoire des données¶
In [6]:
print(f"Composition des jeux de données pour X_train de {X_train.shape[0]} lignes et de {X_train.shape[1]} colonnes")
print(f"Composition des jeux de données pour y_train de {y_train.shape[0]} lignes et de {y_train.shape[1]} colonnes")
Composition des jeux de données pour X_train de 92790 lignes et de 146 colonnes Composition des jeux de données pour y_train de 92790 lignes et de 2 colonnes
In [7]:
print(f"Composition des jeux de données pour X_test de {X_test.shape[0]} lignes et de {X_test.shape[1]} colonnes")
Composition des jeux de données pour X_test de 23198 lignes et de 146 colonnes
In [8]:
X_train.head(3)
Out[8]:
| ID | item1 | item2 | item3 | item4 | item5 | item6 | item7 | item8 | item9 | item10 | item11 | item12 | item13 | item14 | item15 | item16 | item17 | item18 | item19 | item20 | item21 | item22 | item23 | item24 | cash_price1 | cash_price2 | cash_price3 | cash_price4 | cash_price5 | cash_price6 | cash_price7 | cash_price8 | cash_price9 | cash_price10 | cash_price11 | cash_price12 | cash_price13 | cash_price14 | cash_price15 | cash_price16 | cash_price17 | cash_price18 | cash_price19 | cash_price20 | cash_price21 | cash_price22 | cash_price23 | cash_price24 | make1 | make2 | make3 | make4 | make5 | make6 | make7 | make8 | make9 | make10 | make11 | make12 | make13 | make14 | make15 | make16 | make17 | make18 | make19 | make20 | make21 | make22 | make23 | make24 | model1 | model2 | model3 | model4 | model5 | model6 | model7 | model8 | model9 | model10 | model11 | model12 | model13 | model14 | model15 | model16 | model17 | model18 | model19 | model20 | model21 | model22 | model23 | model24 | goods_code1 | goods_code2 | goods_code3 | goods_code4 | goods_code5 | goods_code6 | goods_code7 | goods_code8 | goods_code9 | goods_code10 | goods_code11 | goods_code12 | goods_code13 | goods_code14 | goods_code15 | goods_code16 | goods_code17 | goods_code18 | goods_code19 | goods_code20 | goods_code21 | goods_code22 | goods_code23 | goods_code24 | Nbr_of_prod_purchas1 | Nbr_of_prod_purchas2 | Nbr_of_prod_purchas3 | Nbr_of_prod_purchas4 | Nbr_of_prod_purchas5 | Nbr_of_prod_purchas6 | Nbr_of_prod_purchas7 | Nbr_of_prod_purchas8 | Nbr_of_prod_purchas9 | Nbr_of_prod_purchas10 | Nbr_of_prod_purchas11 | Nbr_of_prod_purchas12 | Nbr_of_prod_purchas13 | Nbr_of_prod_purchas14 | Nbr_of_prod_purchas15 | Nbr_of_prod_purchas16 | Nbr_of_prod_purchas17 | Nbr_of_prod_purchas18 | Nbr_of_prod_purchas19 | Nbr_of_prod_purchas20 | Nbr_of_prod_purchas21 | Nbr_of_prod_purchas22 | Nbr_of_prod_purchas23 | Nbr_of_prod_purchas24 | Nb_of_items | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 85517 | COMPUTERS | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 889.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | APPLE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2020 APPLE MACBOOK AIR 13 3 RETINA DISPLAY M1 PROC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 239246776 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
| 1 | 51113 | COMPUTER PERIPHERALS ACCESSORIES | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 409.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | APPLE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | APPLE WATCH SERIES 6 GPS 44MM SPACE GREY ALUMINIUM | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 239001518 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
| 2 | 83008 | TELEVISIONS HOME CINEMA | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1399.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SAMSUNG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SAMSUNG QE75Q70A 2021 QLED HDR 4K ULTRA HD SMART T | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 239842093 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
In [9]:
X_train.dtypes
Out[9]:
ID int64
item1 str
item2 str
item3 str
item4 str
...
Nbr_of_prod_purchas21 float64
Nbr_of_prod_purchas22 float64
Nbr_of_prod_purchas23 float64
Nbr_of_prod_purchas24 float64
Nb_of_items float64
Length: 146, dtype: object
In [10]:
X_train.iloc[:,5].value_counts()
Out[10]:
item5
FULFILMENT CHARGE 388
BED LINEN 220
LIVING DINING FURNITURE 166
BEDROOM FURNITURE 152
LIGHTING 106
...
SCHOOLWEAR 1
PARTY DECORATIONS 1
LAUNDRY & CLOTHESCARE 1
FITTED KITCHENS 1
WOMENS FOOTWEAR 1
Name: count, Length: 107, dtype: int64
In [11]:
X_train.isnull().sum()
Out[11]:
ID 0
item1 0
item2 48134
item3 79889
item4 88228
...
Nbr_of_prod_purchas21 92713
Nbr_of_prod_purchas22 92727
Nbr_of_prod_purchas23 92740
Nbr_of_prod_purchas24 92747
Nb_of_items 0
Length: 146, dtype: int64
In [12]:
y_train.columns
Out[12]:
Index(['ID', 'fraud_flag'], dtype='str')
In [13]:
y_train.iloc[:,1].value_counts()
Out[13]:
fraud_flag 0 91471 1 1319 Name: count, dtype: int64
In [14]:
y_train.iloc[:,1].value_counts(normalize=True)*100
Out[14]:
fraud_flag 0 98.578511 1 1.421489 Name: proportion, dtype: float64
In [15]:
y_train.head()
Out[15]:
| ID | fraud_flag | |
|---|---|---|
| index | ||
| 0 | 85517 | 0 |
| 1 | 51113 | 0 |
| 2 | 83008 | 0 |
| 3 | 78712 | 0 |
| 4 | 77846 | 0 |
In [16]:
#Vérification de concordance des ID entre les jeux d'entrainement
(X_train["ID"]==y_train["ID"]).all()
Out[16]:
True
In [17]:
#Merge des dataframe X_train et y_train) sur l'ID.
df_train=pd.merge(X_train,y_train, on="ID")
Description des variables en entrée (X) du challenge¶
| Variable | Description | Exemple |
|---|---|---|
ID (Num) |Identifiant unique| 1
item1 à item24 (Char) |Catégorie du bien de l'item 1 à 24| Computer
cash_price1 à cash_price24 (Num) |Prix de l'item 1 à 24 |850
make1 à make24 (Char) |Fabriquant de l'item 1 à 24 |Apple
model1 à model24 (Char) |Description du modèle de l'item 1 à 24 |Apple Iphone XX
goods_code1 à goods_code24 (Char) |Code de l'item 1 à 24 |2378284364
Nbr_of_prod_purchas1 à Nbr_of_prod_purchas24 (Num) |Nombre de produits dans l'item 1 à 24 |2
Nb_of_items (Num) |Nombre total d'items |7
In [18]:
df_train.groupby('fraud_flag')['item2'].value_counts().unstack(fill_value=0)
Out[18]:
| item2 | 6 SPACE GREY 32GB | AERIALS REMOTE CONTROLS | AUDIO ACCESSORIES | BABY & CHILD TRAVEL | BABY CHANGING | BABY CHILD TRAVEL | BABY FEEDING | BABY PLAY EQUIPMENT | BABYWEAR | BAGS & CARRY CASES | BAGS CARRY CASES | BAGS WALLETS ACCESSORIES | BAGS, WALLETS & ACCESSORIES | BARBECUES & ACCESSORIES | BARBECUES ACCESSORIES | BARWARE | BATH & BODYCARE | BATH BODYCARE | BATH LINEN | BATHROOM ACCESSORIES | BATHROOM FIXTURES | BED LINEN | BEDROOM FURNITURE | BLANK MEDIA MEDIA STORAGE | BOYSWEAR | CABLES & ADAPTERS | CABLES ADAPTERS | CARPETS RUGS FLOORING | CARPETS, RUGS & FLOORING | CHILDREN S ACCESSORIES | CHILDREN S FOOTWEAR | CHILDREN S FURNITURE | CHRISTMAS DECORATIONS | COMPUTER NETWORKING | COMPUTER PERIPHERALS & ACCESSORIES | COMPUTER PERIPHERALS ACCESSORIES | COMPUTER SOFTWARE | COMPUTERS | COOKING APPLIANCES | COOKWARE | CRAFT | DECORATING | DECORATIVE ACCESSORIES | DISPOSABLE TABLEWARE CUTLERY | DOOR FURNITURE | DRESSMAKING | FACIAL SKINCARE | FILING DESK ACCESSORIES | FITTED KITCHENS | FOOD PREPARATION | FOOD STORAGE | FRAGRANCE | FULFILMENT CHARGE | GAMES | GAMING | GARDENING EQUIPMENT | GIFT FOOD DRINK | GIFT WRAP | GIRLSWEAR | GREETING CARDS & PERSONALISED STATIONERY | GREETING CARDS PERSONALISED STATIONERY | GYM EQUIPMENT | HEALTH BEAUTY ACCESSORIES | HEALTH BEAUTY ELECTRICAL | HEATING & COOLING APPLIANCES | HEATING COOLING APPLIANCES | HI-FI | HOME AND PERSONAL SECURITY | HOME OFFICE | HOT DRINK PREPARATION | HOUSEHOLD CLEANING | HP ELITEBOOK 850V6 | IMAGING ACCESSORIES | IMAGING EQUIPMENT | JEWELLERY WATCHES | KITCHEN ACCESSORIES | KITCHEN SCALES MEASURES | KITCHEN STORAGE | KITCHEN UTENSILS & GADGETS | KITCHEN UTENSILS GADGETS | LAUNDRY & CLOTHESCARE | LAUNDRY CLOTHESCARE | LIGHTING | LINGERIE HOISERY | LIVING & DINING FURNITURE | LIVING DINING FURNITURE | LUGGAGE | MAKEUP | MEN S ACCESSORIES | MEN S CLOTHES | MEN S FOOTWEAR | MEN S NIGHTWEAR | MEN S UNDERWEAR SOCKS | MENS CLOTHES | NURSERY ACCESSORIES | NURSERY FURNITURE | NURSERY LINEN | NURSERY TOYS | OUTDOOR ACCESSORIES | OUTDOOR FURNITURE | PARTY DECORATIONS | PICNICWARE | POWER & BATTERIES | POWER BATTERIES | PRESERVING BAKING EQUIPMENT | PRINTERS & SCANNERS | PRINTERS SCANNERS | PRODUCT | SCHOOLWEAR | SERVICE | SOFT FURNISHINGS | SPORTS EQUIPMENT | STANDS & BRACKETS | STANDS BRACKETS | STATIONERY SUNDRIES | STORAGE & ORGANISATION | STORAGE ORGANISATION | SUNCARE | SUNGLASSES & READING GLASSES | SUNGLASSES READING GLASSES | TABLE LINEN | TABLEWARE | TELEPHONE ACCESSORIES | TELEPHONES FAX MACHINES TWO-WAY RADIOS | TELEPHONES, FAX MACHINES & TWO-WAY RADIOS | TELEVISIONS & HOME CINEMA | TELEVISIONS HOME CINEMA | TOYS | VIDEOS DVD DIGITAL EQUIPMENT | WARRANTY | WINDOW DRESSING | WOMEN S ACCESSORIES | WOMEN S CLOTHES | WOMEN S FOOTWEAR | WOMEN S NIGHTWEAR | WOMENS ACCESSORIES | WOMENS CLOTHES |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fraud_flag | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 0 | 2 | 1 | 831 | 44 | 27 | 212 | 53 | 9 | 24 | 59 | 527 | 10 | 2 | 28 | 56 | 9 | 1 | 1 | 26 | 19 | 11 | 394 | 1194 | 7 | 11 | 62 | 800 | 70 | 24 | 3 | 4 | 1 | 5 | 2 | 466 | 3810 | 221 | 2134 | 9 | 44 | 8 | 9 | 102 | 1 | 2 | 6 | 10 | 1 | 1 | 26 | 2 | 14 | 17562 | 2 | 11 | 2 | 6 | 2 | 7 | 1 | 4 | 1 | 1 | 23 | 2 | 6 | 37 | 14 | 95 | 10 | 53 | 6 | 3 | 2 | 13 | 4 | 2 | 5 | 1 | 11 | 3 | 13 | 213 | 3 | 428 | 1807 | 2 | 12 | 1 | 12 | 3 | 3 | 5 | 5 | 11 | 39 | 2 | 2 | 43 | 403 | 2 | 2 | 68 | 184 | 3 | 2 | 16 | 1 | 4 | 2472 | 108 | 1 | 8 | 66 | 2 | 4 | 18 | 1 | 4 | 2 | 1 | 58 | 204 | 74 | 153 | 30 | 458 | 8 | 11 | 7616 | 34 | 1 | 23 | 8 | 5 | 1 | 4 |
| 1 | 0 | 0 | 69 | 0 | 0 | 5 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | 0 | 80 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 453 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 7 | 2 | 0 | 11 | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Mise en place des variables¶
In [19]:
#Regroupement par typologie pour construction des features
item_cols = [f"item{i}" for i in range (1,25)]
price_cols = [f"cash_price{i}"for i in range (1,25)]
make_cols = [f"make{i}"for i in range (1,25)]
model_cols = [f"model{i}"for i in range (1,25)]
goods_cols = [f"goods_code{i}"for i in range (1,25)]
qty_cols = [f"Nbr_of_prod_purchas{i}"for i in range (1,25)]
Identification des items les plus fraudés ?¶
In [20]:
df_items = df_train.melt(
id_vars=["ID", "fraud_flag"],
value_vars=item_cols,
var_name="item_pos",
value_name="categorie"
).dropna(subset=["categorie"])
df_items.head(5)
Out[20]:
| ID | fraud_flag | item_pos | categorie | |
|---|---|---|---|---|
| 0 | 85517 | 0 | item1 | COMPUTERS |
| 1 | 51113 | 0 | item1 | COMPUTER PERIPHERALS ACCESSORIES |
| 2 | 83008 | 0 | item1 | TELEVISIONS HOME CINEMA |
| 3 | 78712 | 0 | item1 | COMPUTERS |
| 4 | 77846 | 0 | item1 | TELEVISIONS HOME CINEMA |
In [21]:
fraud_by_cat = (
df_items
.groupby("categorie")["fraud_flag"]
.agg(["mean", "count"])
.rename(columns={"mean": "taux_fraude", "count": "nb_transactions"})
.sort_values("taux_fraude", ascending=False)
)
print(fraud_by_cat.head(10))
fraud_by_cat.head(40).to_excel("cat_fraud.xlsx", index=True)
taux_fraude nb_transactions categorie KITCHEN SCALES & MEASURES 1.000000 2 KITCHEN UTENSILS & GADGETS 0.354839 31 STORAGE & ORGANISATION 0.205128 39 LAUNDRY & CLOTHESCARE 0.147059 34 LUGGAGE 0.133333 15 PRESERVING & BAKING EQUIPMENT 0.111111 9 IMAGING EQUIPMENT 0.082353 85 MEN S CLOTHES 0.067797 59 BLANK MEDIA MEDIA STORAGE 0.058824 17 GAMING 0.055556 36
In [22]:
#Taux de fraude globale
global_rate = df_train["fraud_flag"].mean()
#Identification si catégorie risqué ou pas : Supérieur à la moyenne de fraude et + de 100 transactions
fraud_by_cat["is_risky"] = (
(fraud_by_cat["taux_fraude"] > global_rate) &
(fraud_by_cat["nb_transactions"] > 100)
)
In [23]:
#Création de la liste des catégories risquées..
high_risk_categories = fraud_by_cat[
fraud_by_cat["is_risky"]
].index.tolist()
print(high_risk_categories)
['AUDIO ACCESSORIES', 'NURSERY ACCESSORIES', 'HI-FI', 'TABLEWARE', 'TELEPHONES, FAX MACHINES & TWO-WAY RADIOS', 'BABY CHILD TRAVEL', 'TELEPHONES FAX MACHINES TWO-WAY RADIOS', 'DECORATIVE ACCESSORIES', 'COMPUTERS', 'FULFILMENT CHARGE', 'HOME AND PERSONAL SECURITY', 'SOFT FURNISHINGS', 'BABYWEAR', 'BATH LINEN', 'POWER BATTERIES', 'BATHROOM ACCESSORIES', 'BABY CHANGING']
In [24]:
#Création d'une feature avec la notion de catégorie à risque 1 ou 0
df_train["high_risk_item"] = (
df_train[item_cols]
.isin(high_risk_categories)
.any(axis=1)
.astype(int)
)
In [25]:
df_train.head(3)
Out[25]:
| ID | item1 | item2 | item3 | item4 | item5 | item6 | item7 | item8 | item9 | item10 | item11 | item12 | item13 | item14 | item15 | item16 | item17 | item18 | item19 | item20 | item21 | item22 | item23 | item24 | cash_price1 | cash_price2 | cash_price3 | cash_price4 | cash_price5 | cash_price6 | cash_price7 | cash_price8 | cash_price9 | cash_price10 | cash_price11 | cash_price12 | cash_price13 | cash_price14 | cash_price15 | cash_price16 | cash_price17 | cash_price18 | cash_price19 | cash_price20 | cash_price21 | cash_price22 | cash_price23 | cash_price24 | make1 | make2 | make3 | make4 | make5 | make6 | make7 | make8 | make9 | make10 | make11 | make12 | make13 | make14 | make15 | make16 | make17 | make18 | make19 | make20 | make21 | make22 | make23 | make24 | model1 | model2 | model3 | model4 | model5 | model6 | model7 | model8 | model9 | model10 | model11 | model12 | model13 | model14 | model15 | model16 | model17 | model18 | model19 | model20 | model21 | model22 | model23 | model24 | goods_code1 | goods_code2 | goods_code3 | goods_code4 | goods_code5 | goods_code6 | goods_code7 | goods_code8 | goods_code9 | goods_code10 | goods_code11 | goods_code12 | goods_code13 | goods_code14 | goods_code15 | goods_code16 | goods_code17 | goods_code18 | goods_code19 | goods_code20 | goods_code21 | goods_code22 | goods_code23 | goods_code24 | Nbr_of_prod_purchas1 | Nbr_of_prod_purchas2 | Nbr_of_prod_purchas3 | Nbr_of_prod_purchas4 | Nbr_of_prod_purchas5 | Nbr_of_prod_purchas6 | Nbr_of_prod_purchas7 | Nbr_of_prod_purchas8 | Nbr_of_prod_purchas9 | Nbr_of_prod_purchas10 | Nbr_of_prod_purchas11 | Nbr_of_prod_purchas12 | Nbr_of_prod_purchas13 | Nbr_of_prod_purchas14 | Nbr_of_prod_purchas15 | Nbr_of_prod_purchas16 | Nbr_of_prod_purchas17 | Nbr_of_prod_purchas18 | Nbr_of_prod_purchas19 | Nbr_of_prod_purchas20 | Nbr_of_prod_purchas21 | Nbr_of_prod_purchas22 | Nbr_of_prod_purchas23 | Nbr_of_prod_purchas24 | Nb_of_items | fraud_flag | high_risk_item | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 85517 | COMPUTERS | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 889.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | APPLE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2020 APPLE MACBOOK AIR 13 3 RETINA DISPLAY M1 PROC | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 239246776 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0 | 1 |
| 1 | 51113 | COMPUTER PERIPHERALS ACCESSORIES | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 409.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | APPLE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | APPLE WATCH SERIES 6 GPS 44MM SPACE GREY ALUMINIUM | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 239001518 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0 | 0 |
| 2 | 83008 | TELEVISIONS HOME CINEMA | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1399.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SAMSUNG | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SAMSUNG QE75Q70A 2021 QLED HDR 4K ULTRA HD SMART T | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 239842093 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0 | 0 |
1 - Structure du panier¶
In [26]:
#1.1 - Calcule du nombre items réel présent par ligne
df_train['nb_items_real'] = df_train[item_cols].notna().sum(axis=1)
In [27]:
#1.2 - Identification du nombre d'items uniques par ligne
df_train["nb_items_unique"] = df_train[item_cols].apply(lambda row: row.dropna().nunique(), axis = 1)
In [28]:
#1.3 - Taux de répétion (exclusion des divisions par 0 ou sans valeur)
df_train["ratio_repetition"] = round((df_train["nb_items_unique"] / df_train['nb_items_real']).replace([np.inf, np.nan], 0),2)
2 - Le prix¶
In [29]:
#2.1 - Montant total du Panier par ligne
df_train["Montant_total"] = df_train[price_cols].sum(axis=1)
In [30]:
#2.2 - Prix maximum
df_train["Prix_max"] = df_train[price_cols].max(axis=1)
In [31]:
#2.3 - Prix Moyen
df_train["Prix_moyen"] = df_train[price_cols].mean(axis=1)
In [32]:
#2.4 - Dispersion des prix(Ecart Type)
df_train["Prix_std"] = df_train[price_cols].std(axis=1)
In [33]:
#2.5 - Ratio Prix max / Prix total
df_train["Ratio_prix_max"] = round(df_train["Prix_max"]/df_train["Montant_total"].replace([np.inf, np.nan], 0),2)
In [34]:
#2.6 - Prix minimum
df_train["Prix_min"] = df_train[price_cols].min(axis=1)
3 - Les quantités¶
In [35]:
# 3.1 - Quantités totales
df_train["quantite_total"] = df_train[qty_cols].sum(axis=1)
In [36]:
# 3.2 - Quantités totales
df_train["quantite_moyenne"] = df_train[qty_cols].mean(axis=1)
In [37]:
# 3.3 - Quantités Max
df_train["quantite_max"] = df_train[qty_cols].max(axis=1)
4 - Diversité¶
In [38]:
# 4.1 - Diversité des catégories
df_train["nb_goods_unique"] = df_train[goods_cols].nunique(axis=1)
In [39]:
# 4.2 - Diversité des marques (nombre de marque unique)
df_train["nb_make_unique"] = df_train[make_cols].nunique(axis=1)
In [40]:
# 4.3 - Diversité des modèles
df_train["nb_model_unique"] = df_train[model_cols].nunique(axis=1)
In [41]:
#4.4 - Dominance d'une marque
df_train["dominant_make_ratio"] = df_train[make_cols].apply(lambda row: row.value_counts(normalize=True).max(), axis=1)
Nettoyage du dataframe pour création de X et y¶
In [42]:
#Sauvegarde du nom des colonnes
X_col=df_train.iloc[:,147:].columns
In [43]:
#Création de X en ne conservant que les nouvelles features
X=df_train.iloc[:,147:]
In [44]:
#Création de y
y=df_train.iloc[:,146:147]
In [45]:
df_train.iloc[:,147:].hist(bins=50, figsize=(12,8))
plt.tight_layout()
plt.show()
In [46]:
X.describe()
Out[46]:
| high_risk_item | nb_items_real | nb_items_unique | ratio_repetition | Montant_total | Prix_max | Prix_moyen | Prix_std | Ratio_prix_max | Prix_min | quantite_total | quantite_moyenne | quantite_max | nb_goods_unique | nb_make_unique | nb_model_unique | dominant_make_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 44656.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92790.000000 | 92105.000000 |
| mean | 0.689719 | 1.760502 | 1.595862 | 0.961312 | 1234.759554 | 1140.466494 | 863.112562 | 691.606661 | 0.942662 | 628.109074 | 1.853864 | 1.029176 | 1.062259 | 1.758422 | 1.417567 | 1.637192 | 0.823455 |
| std | 0.462611 | 1.379435 | 0.852246 | 0.130476 | 771.484197 | 693.602559 | 640.697435 | 451.178318 | 0.129666 | 754.869218 | 1.981476 | 0.281946 | 0.493230 | 1.375367 | 0.700295 | 1.200480 | 0.232601 |
| min | 0.000000 | 1.000000 | 1.000000 | 0.080000 | 219.000000 | 79.000000 | 20.750000 | 0.000000 | 0.100000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.100000 |
| 25% | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 779.000000 | 719.000000 | 424.666667 | 383.251875 | 0.960000 | 11.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.500000 |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1099.000000 | 999.000000 | 703.833333 | 622.253967 | 1.000000 | 379.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 75% | 1.000000 | 2.000000 | 2.000000 | 1.000000 | 1499.000000 | 1379.000000 | 1117.000000 | 834.386002 | 1.000000 | 1049.000000 | 2.000000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 | 2.000000 | 1.000000 |
| max | 1.000000 | 24.000000 | 16.000000 | 1.000000 | 21995.000000 | 21995.000000 | 21995.000000 | 6503.968173 | 1.000000 | 21995.000000 | 98.000000 | 40.000000 | 40.000000 | 24.000000 | 18.000000 | 24.000000 | 1.000000 |
In [47]:
y.head(3)
Out[47]:
| fraud_flag | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
Création d'un jeu d'entrainement et de test¶
In [63]:
# importation des bibliothèques scikitlearn pour le Machine Learning
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import precision_recall_curve, average_precision_score,confusion_matrix
from sklearn.model_selection import StratifiedKFold
In [49]:
#Création jeu de test et d'entrainement avec répartition égale des cas de fraude (option = stratify)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,
random_state=42
)
In [50]:
X_train.describe()
Out[50]:
| high_risk_item | nb_items_real | nb_items_unique | ratio_repetition | Montant_total | Prix_max | Prix_moyen | Prix_std | Ratio_prix_max | Prix_min | quantite_total | quantite_moyenne | quantite_max | nb_goods_unique | nb_make_unique | nb_model_unique | dominant_make_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 35797.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 74232.000000 | 73705.000000 |
| mean | 0.689500 | 1.762488 | 1.596239 | 0.960904 | 1236.022740 | 1141.176837 | 862.769164 | 692.628817 | 0.942473 | 626.950062 | 1.855803 | 1.028864 | 1.062480 | 1.760440 | 1.417259 | 1.639131 | 0.823687 |
| std | 0.462701 | 1.380614 | 0.850195 | 0.131145 | 770.902837 | 692.271420 | 638.451629 | 451.369632 | 0.129704 | 752.975482 | 1.973981 | 0.276541 | 0.491748 | 1.376498 | 0.698966 | 1.202884 | 0.232307 |
| min | 0.000000 | 1.000000 | 1.000000 | 0.080000 | 219.000000 | 79.000000 | 20.750000 | 0.000000 | 0.100000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.100000 |
| 25% | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 779.000000 | 719.000000 | 425.383333 | 383.251875 | 0.960000 | 11.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.500000 |
| 50% | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1099.000000 | 999.000000 | 704.583333 | 622.253967 | 1.000000 | 379.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 75% | 1.000000 | 2.000000 | 2.000000 | 1.000000 | 1499.000000 | 1379.000000 | 1117.000000 | 834.386002 | 1.000000 | 1045.000000 | 2.000000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 | 2.000000 | 1.000000 |
| max | 1.000000 | 24.000000 | 16.000000 | 1.000000 | 21995.000000 | 21995.000000 | 21995.000000 | 6503.968173 | 1.000000 | 21995.000000 | 98.000000 | 40.000000 | 40.000000 | 24.000000 | 18.000000 | 24.000000 | 1.000000 |
In [51]:
#Equilibrage Non-fraude / Fraude pour le modèle
scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1]
In [52]:
scale_pos_weight
Out[52]:
69.36208530805688
In [53]:
# Passage sur 1 array des valeurs y
y_train_1d = y_train.squeeze()
y_test_1d = y_test.squeeze()
In [54]:
#Définition des params du modèles
params = {
"objective": "binary",
"n_estimators": 2500,
"learning_rate": 0.01,
"num_leaves": 127,
"min_data_in_leaf": 150,
"feature_fraction": 0.7,
"bagging_fraction": 0.7,
"bagging_freq": 1,
"scale_pos_weight": scale_pos_weight,
"random_state": 42,
"verbose":-1
}
model = lgb.LGBMClassifier(**params)
model.fit(X_train, y_train_1d)
#Calcule des proba
y_proba = model.predict_proba(X_test)[:,1]
#Affichage du PR AUC
print("PR AUC :", average_precision_score(y_test_1d, y_proba))
PR AUC : 0.1816315058691499
In [55]:
# Calcul des points de la courbe
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
# PR-AUC
pr_auc = average_precision_score(y_test, y_proba)
print("PR-AUC :", pr_auc)
# Affichage du PR AUC
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f"PR curve (AUC = {pr_auc:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid(True)
plt.show()
PR-AUC : 0.1816315058691499
In [56]:
#Contrôle des Fraudes sur le modèles
arr = np.round(model.predict_proba(X_test)[:, 1], 0)
values, counts = np.unique(arr, return_counts=True)
print(values)
print(counts)
[0. 1.] [14546 4012]
In [57]:
#Vérification si le modèle n'overfit pas entre le train et le test
pr_train = average_precision_score(y_train_1d, model.predict_proba(X_train.values)[:,1])
pr_test = average_precision_score(y_test_1d, model.predict_proba(X_test.values)[:,1])
print("PR-AUC train :", pr_train)
print("PR-AUC test :", pr_test)
C:\Users\ramat\anaconda3\Lib\site-packages\sklearn\utils\validation.py:2691: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names warnings.warn( C:\Users\ramat\anaconda3\Lib\site-packages\sklearn\utils\validation.py:2691: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names warnings.warn(
PR-AUC train : 0.36714294157234967 PR-AUC test : 0.1816315058691499
In [58]:
# Réinitiation des index pour X et y pour le K-Fold
X = df_train.iloc[:, 147:].reset_index(drop=True)
y = df_train.iloc[:, 146].reset_index(drop=True)
# Initialisation du K-fold stratifié
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
print(f"\n--- Fold {fold+1} ---")
X_train_fold = X.iloc[train_idx]
y_train_fold = y.iloc[train_idx]
X_valid_fold = X.iloc[valid_idx]
y_valid_fold = y.iloc[valid_idx]
model = lgb.LGBMClassifier(**params)
model.fit(X_train_fold, y_train_fold)
y_pred = model.predict_proba(X_valid_fold)[:, 1]
pr_auc = average_precision_score(y_valid_fold, y_pred)
scores.append(pr_auc)
print(f"PR-AUC fold {fold+1}: {pr_auc:.4f}")
print("\nScores PR-AUC :", scores)
print("Moyenne :", np.mean(scores))
print("Écart-type :", np.std(scores))
--- Fold 1 --- PR-AUC fold 1: 0.1700 --- Fold 2 --- PR-AUC fold 2: 0.1473 --- Fold 3 --- PR-AUC fold 3: 0.1630 --- Fold 4 --- PR-AUC fold 4: 0.1753 --- Fold 5 --- PR-AUC fold 5: 0.1342 Scores PR-AUC : [0.17000350082685947, 0.14729706430632306, 0.16304863461822572, 0.17525175579803, 0.13415720235062434] Moyenne : 0.15795163158001252 Écart-type : 0.015167174795692325
In [59]:
#Vérification de la distribution des valeurs
plt.hist(model.predict_proba(X_test)[:,1], bins=50)
plt.show()
In [60]:
from sklearn.metrics import precision_recall_curve
proba = model.predict_proba(X_test)[:,1]
precision, recall, thresholds = precision_recall_curve(y_test_1d, proba)
In [61]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
y_proba = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test_1d, y_proba)
plt.figure(figsize=(8,6))
plt.plot(recall, precision, linewidth=2)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Courbe Precision-Recall")
plt.grid(True)
plt.show()
In [68]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
thresholds = np.linspace(0, 1, 101)
best_f1 = 0
best_threshold = 0
for t in thresholds:
preds = (y_proba >= t).astype(int)
f1 = f1_score(y_test_1d, preds)
if f1 > best_f1:
best_f1 = f1
best_threshold = t
best_precision = precision_score(y_test_1d, preds)
best_recall = recall_score(y_test_1d, preds)
print(f"Meilleur seuil : {best_threshold:.3f}")
print(f"F1-score : {best_f1:.3f}")
print(f"Precision : {best_precision:.3f}")
print(f"Recall : {best_recall:.3f}")
Meilleur seuil : 0.930 F1-score : 0.395 Precision : 0.581 Recall : 0.299
In [69]:
precision, recall, thresholds = precision_recall_curve(y_test_1d, y_proba)
plt.plot(thresholds, recall[:-1])
plt.xlabel("Seuil")
plt.ylabel("Recall")
plt.title("Recall en fonction du seuil")
plt.grid(True)
plt.show()
In [81]:
#Vérification des variables qui pèsent le plus dans le résultat
import shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train)
C:\Users\ramat\anaconda3\Lib\site-packages\shap\explainers\_tree.py:587: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray warnings.warn(
Restitution du Challenge¶
In [70]:
#Chargement du jeu de test pour le challenge (Application des étapes précédentes)
df_train=pd.read_csv("X_test_8skS2ey.csv", low_memory=False)
In [71]:
#Ajout d'une colonne "fraud flag" pour correspondre au merge précédent
df_train["fraud_flag"]=0
In [72]:
#Attribution de la valeur risk
df_train["high_risk_item"] = (
df_train[item_cols]
.isin(high_risk_categories)
.any(axis=1)
.astype(int)
)
In [73]:
#Ajout des features
#1.1 - Calcule du nombre items réel présent par ligne
df_train['nb_items_real'] = df_train[item_cols].notna().sum(axis=1)
#1.2 - Identification du nombre d'items uniques par ligne
df_train["nb_items_unique"] = df_train[item_cols].apply(lambda row: row.dropna().nunique(), axis = 1)
#1.3 - Taux de répétion (exclusion des divisions par 0 ou sans valeur)
df_train["ratio_repetition"] = round((df_train["nb_items_unique"] / df_train['nb_items_real']).replace([np.inf, np.nan], 0),2)
#2.1 - Montant total du Panier par ligne
df_train["Montant_total"] = df_train[price_cols].sum(axis=1)
#2.2 - Prix maximum
df_train["Prix_max"] = df_train[price_cols].max(axis=1)
#2.3 - Prix Moyen
df_train["Prix_moyen"] = df_train[price_cols].mean(axis=1)
#2.4 - Dispersion des prix(Ecart Type)
df_train["Prix_std"] = df_train[price_cols].std(axis=1)
#2.5 - Ratio Prix max / Prix total
df_train["Ratio_prix_max"] = round(df_train["Prix_max"]/df_train["Montant_total"].replace([np.inf, np.nan], 0),2)
#2.6 - Prix minimum
df_train["Prix_min"] = df_train[price_cols].min(axis=1)
# 3.1 - Quantités totales
df_train["quantite_total"] = df_train[qty_cols].sum(axis=1)
# 3.2 - Quantités totales
df_train["quantite_moyenne"] = df_train[qty_cols].mean(axis=1)
# 3.3 - Quantités Max
df_train["quantite_max"] = df_train[qty_cols].max(axis=1)
# 4.1 - Diversité des catégories
df_train["nb_goods_unique"] = df_train[goods_cols].nunique(axis=1)
# 4.2 - Diversité des marques
df_train["nb_make_unique"] = df_train[make_cols].nunique(axis=1)
# 4.3 - Diversité des modèles
df_train["nb_model_unique"] = df_train[model_cols].nunique(axis=1)
# 4.4 - Dominance d'une marque
df_train["dominant_make_ratio"] = df_train[make_cols].apply(lambda row: row.value_counts(normalize=True).max(), axis=1)
In [74]:
#Création de X en ne conservant que les nouvelles features
X_challenge=df_train.iloc[:,147:]
#Création de y
y_challenge=df_train.iloc[:,146:147]
In [75]:
#Estimation des proba
y_pred_proba = model.predict_proba(X_challenge)[:, 1]
In [76]:
#Conversion en DataFrame
submission = pd.DataFrame(y_pred_proba,columns =['fraud_flag'])
In [77]:
df_train["fraud_flag"]=pd.DataFrame(y_pred_proba,columns =['fraud_flag'])
In [78]:
#Conservation des colonnes ID et Fraud_flag pour la restitution
submission = df_train[["ID","fraud_flag"]]
In [79]:
#Ajout d'un nom à l'index
submission.index.name = "Index"
In [80]:
#Exportation du DataFrame Challenge
submission.to_csv("submission.csv", index=True)