Segmentation d'une clientèle bancaire

Les objectifs et données de ce tutoriel sont précisément décrits dans la vignette disponible sur le site wikistat.fr. Ce calepin liste les commandes Pyhton produisant les mêmes résultats que le code SAS du scénario initial.

Lecture et prétraitement des données

In [1]:
%matplotlib inline
import pylab as P
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'
In [2]:
#Lecture
listeVar=['MATRIC','DEPTS','PVS','SEXEQ','AGER','FAMIQ', 
 'RELAT','PCSPQ','QUALS','G03G04S','G25G26S', 
 'G29G30S','G35G36S','G37G38S','G45G46S','G47G48S',
 'IMPNBS','REJETS','OPGNB','MOYRV','TAVEP','ENDET','GAGET',
 'GAGEC','GAGEM','KVUNB','QSMOY','QCRED','DMVTP','BOPPN',
 'FACAN','LGAGT','VIENB','VIEMT','UEMNB','UEMMTS','XLGNB',
 'XLGMT','YLVNB','YLVMT','NBELTS','MTELTS','NBCATS','MTCATS',
 'NBBECS','MTBECS','ZOCNB','NTCAS','NPTAG','SEGV2S','ITAVC',
 'HAVEF','JNBJD1S','JNBJD2S','JNBJD3S','CARVP']
path=""
# path="http://www.math.univ-toulouse.fr/~besse/Wikistat/data/"
visaraw = pd.read_table(path+'visa_raw_sas.dat',
                        sep=';',header=None, index_col=0, names=listeVar)
In [3]:
# Fonction pour définir le bon type "categorical" des variables qualitatives 
# avec dictionnaire des modalités pour le recodage
def create_categorical_data(df,column_name, cat_name_dic):
    cat_columns = pd.Categorical(df[column_name],ordered=False)
    new_categorie = [cat_name_dic[old_name] for old_name in cat_columns.categories]
    return cat_columns.rename_categories(new_categorie)
In [4]:
#FAMIQ
cat_name_dic = {'C':'Fcel','D':'Fdiv','M':'Fmar','S':'Fsep','U':'Fuli','V':'Fveu','inc':'Finc'}
visaraw["FAMIQ"]=create_categorical_data(visaraw,"FAMIQ", cat_name_dic)
#SEXEQ
cat_name_dic = {'1':'Shom','2':'Sfem','inc':'Sinc'}
visaraw["SEXEQ"]=create_categorical_data(visaraw,"SEXEQ", cat_name_dic)
#CARVP
cat_name_dic = {'non':'Cnon','oui':'Coui'}
visaraw["CARVP"]=create_categorical_data(visaraw,"CARVP", cat_name_dic)
#PCSPQ
visaraw["PCSPQ"] = visaraw["PCSPQ"].str.get(0).replace("0","i")
cat_name_dic = {'1':'Pagr', '2':'Part', '3':'Pcad', '4':'Pint', '5':'Pemp','6':'Pouv',
                '7':'Pret', '8':'Psan', 'i':'Pinc'}
visaraw["PCSPQ"]=create_categorical_data(visaraw,"PCSPQ", cat_name_dic)
In [5]:
# Suppression d'observations non pertinentes 
# interdits bancaires, comptes professionnels
visaraw = visaraw[visaraw.NBCATS!= 1]
visaraw = visaraw[visaraw.NBBECS!= 1]

visaraw= visaraw[np.logical_not(visaraw.G29G30S.isin(['B','X']))] 
visaraw= visaraw[np.logical_not(visaraw.G03G04S.isin(['B','X']))] 
visaraw= visaraw[np.logical_not(visaraw.G45G46S.isin(['A','B','X']))] 
visaraw= visaraw[np.logical_not(visaraw.G37G38S.isin(['A']))] 
visaraw= visaraw[np.logical_not(visaraw.G25G26S.isin(['A','B','X']))] 
visaraw= visaraw[np.logical_not(visaraw.G47G48S.isin(['B']))] 
In [6]:
# Suppression de données manquantes
visaraw = visaraw[visaraw.AGER!='.']
# Seulement les clients de 18 à 65 ans
visaraw["AGER"]=visaraw["AGER"].astype(int)
visaraw=visaraw[np.logical_and(visaraw["AGER"]<66,visaraw["AGER"]>17)]

visaraw["ZOCNB"]=visaraw["ZOCNB"].replace(".",np.nan)
visaraw["DMVTP"]=visaraw["DMVTP"].replace(".",np.nan)
In [7]:
# Modificaton des types des variables 
listeVarFl=['RELAT','IMPNBS','REJETS','OPGNB','MOYRV',
'TAVEP','ENDET','GAGET','GAGEC','GAGEM','KVUNB','QSMOY','QCRED',
'DMVTP','BOPPN','FACAN','LGAGT','VIENB','VIEMT','UEMNB','UEMMTS','XLGNB',
 'XLGMT','YLVNB','YLVMT','NBELTS','MTELTS','NBCATS','MTCATS',
 'NBBECS','MTBECS','ZOCNB','NTCAS','NPTAG','ITAVC',
 'HAVEF','JNBJD1S','JNBJD2S','JNBJD3S']
visaraw[listeVarFl]=visaraw[listeVarFl].astype(float)
In [8]:
# Un seule variable nombre de jours de débit
visaraw["JNBJD"]=visaraw["JNBJD1S"]+visaraw["JNBJD2S"]+visaraw["JNBJD3S"] 
In [9]:
# suppression des variables devenues inutiles
listeVarSup=["DEPTS","QUALS","G03G04S","G25G26S","G29G30S","G35G36S",
            "G37G38S","G45G46S","G47G48S","SEGV2S","PVS","JNBJD1S","JNBJD2S","JNBJD3S"]
visaraw.drop(listeVarSup,inplace=True,axis=1)
In [10]:
# mise à jour de la liste
listeVar = [x for x in listeVar if x not in listeVarSup]

Desciption univariée et bivariée

In [11]:
visaprem=visaraw
visaprem.mean()
Out[11]:
AGER          42.532153
RELAT        157.117428
IMPNBS         0.000000
REJETS        -0.072693
OPGNB          1.505126
MOYRV         47.626281
TAVEP      57249.062442
ENDET          5.456664
GAGET      77316.455732
GAGEC       4198.657036
GAGEM      20230.041938
KVUNB          1.027959
QSMOY      10674.265610
QCRED         25.906803
DMVTP         19.059044
BOPPN         28.999068
FACAN      23379.273066
LGAGT      52887.756757
VIENB          0.239515
VIEMT      35914.712954
UEMNB          1.473439
UEMMTS     75442.118360
XLGNB          0.652377
XLGMT      32183.942218
YLVNB          0.758621
YLVMT      20739.696179
NBELTS         0.058714
MTELTS      4325.424045
NBCATS         0.002796
MTCATS        19.710158
NBBECS         0.000932
MTBECS     18173.345760
ZOCNB         11.500629
NTCAS          1.069897
NPTAG          0.136067
ITAVC     146819.268406
HAVEF      50727.357875
JNBJD         12.084809
dtype: float64
In [12]:
visaprem["RELAT"].hist(bins=20)
P.show()
In [13]:
visaprem["TAVEP"].hist(bins=20)
P.show()
In [14]:
visaprem["QSMOY"].hist(bins=20)
P.show()
In [15]:
import scipy.stats as stats
stats.probplot(visaprem["QSMOY"], dist="norm", plot=P)
P.show()
In [16]:
visaprem["PCSPQ"].value_counts()
Out[16]:
Pemp    288
Pcad    276
Psan    199
Pint    170
Pouv     85
Part     31
Pret     21
Pinc      2
Pagr      1
dtype: int64
In [17]:
visaprem["FAMIQ"].value_counts()
Out[17]:
Fmar    547
Fcel    360
Fdiv     86
Finc     32
Fuli     25
Fsep     14
Fveu      9
dtype: int64
In [18]:
visaprem.boxplot('QSMOY','CARVP')
P.show()
In [19]:
from statsmodels.graphics.mosaicplot import mosaic
visaprem["SEXEQ"]=visaprem["SEXEQ"].cat.remove_unused_categories()
table=pd.crosstab(visaprem["SEXEQ"],visaprem["CARVP"])
print(table)
mosaic(table.stack())
P.show()
CARVP  Cnon  Coui
SEXEQ            
Shom    365   303
Sfem    349    56
In [20]:
table=pd.crosstab(visaprem["NTCAS"],visaprem["CARVP"])
print(table)
mosaic(table.stack())
P.show()
CARVP  Cnon  Coui
NTCAS            
0       261    17
1       376   143
2        66   146
3        10    42
4         1    10
5         0     1
In [21]:
visaprem.plot('AGER','RELAT',kind='scatter')
P.show()
In [22]:
table=pd.crosstab(visaprem["NTCAS"],visaprem["ZOCNB"].isnull())
print(table)
ZOCNB  False  True 
NTCAS              
0          0    278
1        519      0
2        212      0
3         52      0
4         11      0
5          1      0
In [23]:
#Transformations des variables 
## regroupements de modalités
visatrans=visaprem
FAMIQ_DIC = {"Fmar":'Fcou',"Fuli":'Fcou',"Fdiv":'Fseu','Fveu':'Fseu',
                        "Fsep":'Fseu',"Fcel":'Fseu'}
PCSPQ_DIC = {"Pagr":"Pint",'Part':"Pint","Pret":"Psan","Pinc":"Psan"}

visatrans["PCSPQ"]=visatrans["PCSPQ"].astype(str).replace(PCSPQ_DIC)
visatrans["FAMIQ"]=visatrans["FAMIQ"].astype(str).replace(FAMIQ_DIC)
In [24]:
# complétion de valeurs
visatrans["ZOCNB"].fillna(0, inplace=True)
visatrans["ZOCNB"]
visatrans = visatrans[visatrans.DMVTP.notnull()]
# Correction ancienneté relation
visatrans["RELAT"]= [x-720 if x>600 else x for x in visatrans["RELAT"]]
In [25]:
# log des variables de distribution dissymétrique 
def log1(x):
    return np.log(1+x)
listeVarLog=['OPGNB','MOYRV','TAVEP','ENDET','GAGET','GAGEC',
'GAGEM','QCRED','DMVTP','BOPPN','FACAN','LGAGT',
'VIEMT','XLGMT','YLVMT','ITAVC','HAVEF','JNBJD']
visatrans[listeVarLog]=log1(visatrans[listeVarLog])
visatrans["ZOCNB"]=np.sqrt(visatrans["ZOCNB"])
In [26]:
visatrans2 = visatrans[["SEXEQ","AGER","FAMIQ","RELAT","PCSPQ","OPGNB", "MOYRV","TAVEP", "ENDET", "GAGET","GAGEC", "GAGEM", "KVUNB","QSMOY", "QCRED", "DMVTP","BOPPN", "FACAN", "LGAGT","VIENB", "VIEMT", "UEMNB","XLGNB", "XLGMT", "YLVNB","YLVMT", "ZOCNB", "NPTAG","ITAVC", "HAVEF","JNBJD", "CARVP"]]
visatrans2.head()
Out[26]:
SEXEQ AGER FAMIQ RELAT PCSPQ OPGNB MOYRV TAVEP ENDET GAGET ... XLGNB XLGMT YLVNB YLVMT ZOCNB NPTAG ITAVC HAVEF JNBJD CARVP
MATRIC
148009 Shom 51 Fcou 238 Pcad 0.000000 4.753590 13.461603 1.609438 11.688701 ... 2 13.161865 2 12.110635 3.741657 0 14.080718 13.230263 0.693147 Coui
442153 Shom 52 Fcou 270 Pint 1.609438 9.882264 9.096163 0.000000 0.000000 ... 2 9.096163 0 0.000000 2.236068 0 16.804029 11.804826 0.000000 Coui
552427 Shom 58 Fcou 139 Pcad 0.000000 3.713572 8.132413 0.000000 0.000000 ... 0 0.000000 1 8.132413 0.000000 0 11.717872 0.000000 0.000000 Coui
556005 Shom 27 Fseu 99 Psan 0.000000 2.890372 11.242717 0.000000 0.000000 ... 1 10.743178 2 10.309286 3.741657 0 11.329303 0.000000 0.000000 Coui
556686 Shom 49 Fseu 89 Pemp 0.000000 5.926926 13.067592 0.000000 12.250391 ... 3 12.899220 2 11.203025 3.316625 1 13.111856 9.972267 2.772589 Coui

5 rows × 32 columns

In [27]:
from pandas.tools.plotting import scatter_matrix
scatter_matrix(visatrans[["AGER", "RELAT","QSMOY", "OPGNB", "MOYRV","TAVEP", "ENDET", "GAGET","GAGEC","GAGEM", "QCRED", "DMVTP","BOPPN","FACAN","LGAGT","VIENB","VIEMT","UEMNB","XLGNB","XLGMT","YLVNB","YLVMT","ZOCNB"]], alpha=0.2, figsize=(15, 15), diagonal='kde')
P.show()