Python for Nurses (23)¶
Predicting the occurrence of a clinical condition¶
In [6]:
# The study's workflow for predicting the occurrence of diabetes, for example
from IPython.display import Image
Image('Workflow.png')
Out[6]:
In [7]:
# Importing the working modules (libraries)
import pandas as pd
import matplotlib as mat
import matplotlib.pyplot as plt
import numpy as np
# render charts inline in the notebook
%matplotlib inline
In [8]:
# Loading the dataset into a DataFrame (the file is in the Jupyter Lab working directory)
df = pd.read_csv("pima-data.csv")
In [9]:
# Checking the shape of the data
# (data preparation step)
df.shape
Out[9]:
(768, 10)
In [10]:
# Inspecting the first 5 rows of the dataset
df.head(5)
Out[10]:
|   | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1.3780 | True |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 1.1426 | False |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 0.0000 | True |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0.9062 | False |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1.3790 | True |
In [11]:
# Inspecting the last 5 rows of the dataset
df.tail(5)
Out[11]:
|   | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 1.8912 | False |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 1.0638 | False |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0.9062 | False |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 0.0000 | True |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 1.2214 | False |
In [12]:
# Checking whether there are any null values
df.isnull().values.any()
Out[12]:
False
In [13]:
# Visualizing the correlation between the variables
# Correlation does not imply causation
def plot_corr(df, size=10):
    corr = df.corr()
    fig, ax = plt.subplots(figsize = (size, size))
    ax.matshow(corr)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
In [14]:
# Plotting the correlation (or lack thereof) between the variables
plot_corr(df)
In [15]:
# Viewing the same correlations as a table
# Correlation coefficient:
# +1 = strong positive correlation
#  0 = no correlation
# -1 = strong negative correlation
df.corr()
Out[15]:
|   | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| num_preg | 1.000000 | 0.129459 | 0.141282 | -0.081672 | -0.073535 | 0.017683 | -0.033523 | 0.544341 | -0.081673 | 0.221898 |
| glucose_conc | 0.129459 | 1.000000 | 0.152590 | 0.057328 | 0.331357 | 0.221071 | 0.137337 | 0.263514 | 0.057326 | 0.466581 |
| diastolic_bp | 0.141282 | 0.152590 | 1.000000 | 0.207371 | 0.088933 | 0.281805 | 0.041265 | 0.239528 | 0.207371 | 0.065068 |
| thickness | -0.081672 | 0.057328 | 0.207371 | 1.000000 | 0.436783 | 0.392573 | 0.183928 | -0.113970 | 1.000000 | 0.074752 |
| insulin | -0.073535 | 0.331357 | 0.088933 | 0.436783 | 1.000000 | 0.197859 | 0.185071 | -0.042163 | 0.436785 | 0.130548 |
| bmi | 0.017683 | 0.221071 | 0.281805 | 0.392573 | 0.197859 | 1.000000 | 0.140647 | 0.036242 | 0.392574 | 0.292695 |
| diab_pred | -0.033523 | 0.137337 | 0.041265 | 0.183928 | 0.185071 | 0.140647 | 1.000000 | 0.033561 | 0.183927 | 0.173844 |
| age | 0.544341 | 0.263514 | 0.239528 | -0.113970 | -0.042163 | 0.036242 | 0.033561 | 1.000000 | -0.113973 | 0.238356 |
| skin | -0.081673 | 0.057326 | 0.207371 | 1.000000 | 0.436785 | 0.392574 | 0.183927 | -0.113973 | 1.000000 | 0.074750 |
| diabetes | 0.221898 | 0.466581 | 0.065068 | 0.074752 | 0.130548 | 0.292695 | 0.173844 | 0.238356 | 0.074750 | 1.000000 |
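Note the 1.000000 correlation between 'thickness' and 'skin': the two columns carry the same information ('skin' appears to be 'thickness' converted to another unit), so one of them is redundant. A minimal sketch of how the duplicate could be dropped (not executed here; this notebook instead simply leaves 'skin' out of the feature list further down):

# Drop the redundant column, keeping 'thickness'
df = df.drop('skin', axis=1)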
In [16]:
''' Machine learning algorithms only understand numeric data, so the "map" function
is used to convert True/False into numbers '''
# Defining the classes
diabetes_map = {True : 1, False : 0}
In [17]:
# Applying the mapping to the dataset
df['diabetes'] = df['diabetes'].map(diabetes_map)
In [18]:
# Checking the first rows of the dataset to confirm the change
df.head(5)
Out[18]:
|   | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1.3780 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 1.1426 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 0.0000 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0.9062 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1.3790 | 1 |
In [19]:
# Checking how the classes are distributed
num_true = len(df.loc[df['diabetes'] == 1])
num_false = len(df.loc[df['diabetes'] == 0])
print("Number of True cases : {0} ({1:2.2f}%)".format(num_true, (num_true/ (num_true + num_false)) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/ (num_true + num_false)) * 100))
Number of True cases : 268 (34.90%)
Number of False cases: 500 (65.10%)
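The same distribution can be obtained more compactly with pandas; a minimal sketch:

# Percentage of each class in a single call
df['diabetes'].value_counts(normalize=True) * 100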
Splitting: dividing the data into training (70%) and test (30%) sets¶
In [20]:
from IPython.display import Image
Image('Treinamento.png')
Out[20]:
In [21]:
import sklearn as sk
sk.__version__
Out[21]:
'0.23.2'
In [22]:
# The function that creates the train/test split
from sklearn.model_selection import train_test_split
In [23]:
# Selecting the predictor variables (feature selection); note that 'skin' is left out
atributos = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'diab_pred', 'age']
In [24]:
# The variable to be predicted
atrib_prev = ['diabetes']
In [25]:
# Creating the feature (X) and target (Y) arrays
X = df[atributos].values
Y = df[atrib_prev].values
In [26]:
X
Out[26]:
array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])
In [27]:
Y
Out[27]:
array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [0]], dtype=int64)
In [28]:
# Defining the split fraction
split_test_size = 0.30
In [29]:
# Creating the training and test sets
X_treino, X_teste, Y_treino, Y_teste = train_test_split(X, Y, test_size = split_test_size, random_state = 42)
In [30]:
# Printing the resulting proportions
print("{0:0.2f}% in the training data".format((len(X_treino)/len(df.index)) * 100))
print("{0:0.2f}% in the test data".format((len(X_teste)/len(df.index)) * 100))
69.92% in the training data
30.08% in the test data
In [31]:
X_treino
Out[31]:
array([[  1.   ,  95.   ,  60.   , ...,  23.9  ,   0.26 ,  22.   ],
       [  5.   , 105.   ,  72.   , ...,  36.9  ,   0.159,  28.   ],
       [  0.   , 135.   ,  68.   , ...,  42.3  ,   0.365,  24.   ],
       ...,
       [ 10.   , 101.   ,  86.   , ...,  45.6  ,   1.136,  38.   ],
       [  0.   , 141.   ,   0.   , ...,  42.4  ,   0.205,  29.   ],
       [  0.   , 125.   ,  96.   , ...,  22.5  ,   0.262,  21.   ]])
In [32]:
X_teste
Out[32]:
array([[6.00e+00, 9.80e+01, 5.80e+01, ..., 3.40e+01, 4.30e-01, 4.30e+01],
       [2.00e+00, 1.12e+02, 7.50e+01, ..., 3.57e+01, 1.48e-01, 2.10e+01],
       [2.00e+00, 1.08e+02, 6.40e+01, ..., 3.08e+01, 1.58e-01, 2.10e+01],
       ...,
       [0.00e+00, 1.27e+02, 8.00e+01, ..., 3.63e+01, 8.04e-01, 2.30e+01],
       [6.00e+00, 1.05e+02, 7.00e+01, ..., 3.08e+01, 1.22e-01, 3.70e+01],
       [5.00e+00, 7.70e+01, 8.20e+01, ..., 3.58e+01, 1.56e-01, 3.50e+01]])
In [33]:
Y_treino
Out[33]:
array([[0],
       [0],
       [1],
       ...,
       [1],
       [1],
       [0]], dtype=int64)
In [34]:
Y_teste
Out[34]:
array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)
Checking the split¶
In [35]:
print("Original True : {0} ({1:0.2f}%)".format(len(df.loc[df['diabetes'] == 1]),
(len(df.loc[df['diabetes'] ==1])/len(df.index) * 100)))
print("Original False : {0} ({1:0.2f}%)".format(len(df.loc[df['diabetes'] == 0]),
(len(df.loc[df['diabetes'] == 0])/len(df.index) * 100)))
print("")
print("Training True : {0} ({1:0.2f}%)".format(len(Y_treino[Y_treino[:] == 1]),
(len(Y_treino[Y_treino[:] == 1])/len(Y_treino) * 100)))
print("Training False : {0} ({1:0.2f}%)".format(len(Y_treino[Y_treino[:] == 0]),
(len(Y_treino[Y_treino[:] == 0])/len(Y_treino) * 100)))
print("")
print("Test True : {0} ({1:0.2f}%)".format(len(Y_teste[Y_teste[:] == 1]),
(len(Y_teste[Y_teste[:] == 1])/len(Y_teste) * 100)))
print("Test False : {0} ({1:0.2f}%)".format(len(Y_teste[Y_teste[:] == 0]),
(len(Y_teste[Y_teste[:] == 0])/len(Y_teste) * 100)))
Original True : 268 (34.90%)
Original False : 500 (65.10%)

Training True : 188 (35.01%)
Training False : 349 (64.99%)

Test True : 80 (34.63%)
Test False : 151 (65.37%)
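The training (35.01%) and test (34.63%) proportions track the original 34.90% closely but not exactly. When an exact match matters, train_test_split accepts a stratify argument; a minimal sketch:

# Stratified split: class proportions are preserved in both subsets
X_treino, X_teste, Y_treino, Y_teste = train_test_split(
    X, Y, test_size = split_test_size, random_state = 42, stratify = Y)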
Missing or hidden values¶
In [36]:
df.head(5)
Out[36]:
|   | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1.3780 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 1.1426 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 0.0000 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0.9062 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1.3790 | 1 |
In [37]:
print("# Linhas no dataframe {0}".format(len(df)))
print("# Linhas missing glucose_conc: {0}".format(len(df.loc[df['glucose_conc'] == 0])))
print("# Linhas missing diastolic_bp: {0}".format(len(df.loc[df['diastolic_bp'] == 0])))
print("# Linhas missing thickness: {0}".format(len(df.loc[df['thickness'] == 0])))
print("# Linhas missing insulin: {0}".format(len(df.loc[df['insulin'] == 0])))
print("# Linhas missing bmi: {0}".format(len(df.loc[df['bmi'] == 0])))
print("# Linhas missing age: {0}".format(len(df.loc[df['age'] == 0])))
# Rows in the dataframe 768
# Rows missing glucose_conc: 5
# Rows missing diastolic_bp: 35
# Rows missing thickness: 227
# Rows missing insulin: 374
# Rows missing bmi: 11
# Rows missing age: 0
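The repeated prints above can be collapsed into a loop over the column names; a minimal sketch producing the same counts:

# Count the zeros column by column ('age' legitimately has none)
for col in ['glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'age']:
    print("# Rows missing {0}: {1}".format(col, len(df.loc[df[col] == 0])))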
Handling Missing Data - Impute¶
In [45]:
# Fixing the dataset's hidden missing values (the zeros)
from sklearn.impute import SimpleImputer
In [46]:
# Creating the imputer object
preenche_0 = SimpleImputer(missing_values = 0, strategy = "mean")
# Replacing values equal to zero with the mean of the data
X_treino = preenche_0.fit_transform(X_treino)
X_teste = preenche_0.fit_transform(X_teste)
In [47]:
X_treino
Out[47]:
array([[  1.        ,  95.        ,  60.        , ...,  23.9       ,   0.26      ,  22.        ],
       [  5.        , 105.        ,  72.        , ...,  36.9       ,   0.159     ,  28.        ],
       [  4.34056399, 135.        ,  68.        , ...,  42.3       ,   0.365     ,  24.        ],
       ...,
       [ 10.        , 101.        ,  86.        , ...,  45.6       ,   1.136     ,  38.        ],
       [  4.34056399, 141.        ,  72.24131274, ...,  42.4       ,   0.205     ,  29.        ],
       [  4.34056399, 125.        ,  96.        , ...,  22.5       ,   0.262     ,  21.        ]])
In [48]:
X_teste
Out[48]:
array([[6.00000000e+00, 9.80000000e+01, 5.80000000e+01, ..., 3.40000000e+01, 4.30000000e-01, 4.30000000e+01],
       [2.00000000e+00, 1.12000000e+02, 7.50000000e+01, ..., 3.57000000e+01, 1.48000000e-01, 2.10000000e+01],
       [2.00000000e+00, 1.08000000e+02, 6.40000000e+01, ..., 3.08000000e+01, 1.58000000e-01, 2.10000000e+01],
       ...,
       [4.85714286e+00, 1.27000000e+02, 8.00000000e+01, ..., 3.63000000e+01, 8.04000000e-01, 2.30000000e+01],
       [6.00000000e+00, 1.05000000e+02, 7.00000000e+01, ..., 3.08000000e+01, 1.22000000e-01, 3.70000000e+01],
       [5.00000000e+00, 7.70000000e+01, 8.20000000e+01, ..., 3.58000000e+01, 1.56000000e-01, 3.50000000e+01]])
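One caveat: calling fit_transform on X_teste, as above, lets the imputer learn the test set's own means. The stricter practice is to fit on the training data only and reuse those means on the test data, so that no information leaks from the test set into preprocessing; a minimal sketch:

preenche_0 = SimpleImputer(missing_values = 0, strategy = "mean")
X_treino = preenche_0.fit_transform(X_treino)  # learn the column means from the training data
X_teste = preenche_0.transform(X_teste)        # apply those same means to the test data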
Building and training the ML model¶
In [56]:
# Using a Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
In [57]:
# Creating the predictive model
modelo_v1 = GaussianNB()
In [58]:
# Training the model
modelo_v1.fit(X_treino, Y_treino.ravel())
Out[58]:
GaussianNB()
Is it accurate on the training data?¶
In [59]:
from sklearn import metrics
In [60]:
nb_predict_train = modelo_v1.predict(X_treino) # X holds the predictor (input) variables
In [61]:
# Assessing the model's accuracy on the training data
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(Y_treino, nb_predict_train)))
print()
Accuracy: 0.7542
In [62]:
# Evaluating on the test data
nb_predict_test = modelo_v1.predict(X_teste)
In [63]:
print("Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_teste, nb_predict_test)))
print()
Accuracy: 0.7359
The test data are new to the model, so this accuracy is the more realistic estimate of its performance.
Metrics¶
In [64]:
from IPython.display import Image
Image('ConfusionMatrix.jpg')
Out[64]:
In [65]:
# A matrix for assessing the model's error rates
# Creating a Confusion Matrix
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(Y_teste, nb_predict_test, labels = [1, 0])))
print("")
print("Classification Report")
print(metrics.classification_report(Y_teste, nb_predict_test, labels = [1, 0]))
Confusion Matrix
[[ 52  28]
 [ 33 118]]

Classification Report
              precision    recall  f1-score   support

           1       0.61      0.65      0.63        80
           0       0.81      0.78      0.79       151

    accuracy                           0.74       231
   macro avg       0.71      0.72      0.71       231
weighted avg       0.74      0.74      0.74       231
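To connect the matrix to the report: with labels = [1, 0], rows are the actual classes and columns the predicted ones, so TP = 52 (diabetics correctly flagged), FN = 28 (diabetics missed), FP = 33 (false alarms) and TN = 118. The class-1 metrics then follow by hand; a minimal sketch:

tp, fn, fp, tn = 52, 28, 33, 118
print("Recall (1): {0:.2f}".format(tp / (tp + fn)))     # 0.65 - sensitivity: diabetics caught
print("Precision (1): {0:.2f}".format(tp / (tp + fp)))  # 0.61 - flagged cases that are real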
Optimizing the model with RandomForest: an ensemble of decision trees¶
In [66]:
from sklearn.ensemble import RandomForestClassifier
In [68]:
modelo_v2 = RandomForestClassifier(random_state = 42)
modelo_v2.fit(X_treino, Y_treino.ravel())
Out[68]:
RandomForestClassifier(random_state=42)
In [69]:
# Checking against the training data
rf_predict_train = modelo_v2.predict(X_treino)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(Y_treino, rf_predict_train)))
Accuracy: 1.0000
In [70]:
# Checking against the test data
rf_predict_test = modelo_v2.predict(X_teste)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(Y_teste, rf_predict_test)))
print()
Accuracy: 0.7403
In [71]:
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(Y_teste, rf_predict_test, labels = [1, 0])))
print("")
print("Classification Report")
print(metrics.classification_report(Y_teste, rf_predict_test, labels = [1, 0]))
Confusion Matrix
[[ 52  28]
 [ 32 119]]

Classification Report
              precision    recall  f1-score   support

           1       0.62      0.65      0.63        80
           0       0.81      0.79      0.80       151

    accuracy                           0.74       231
   macro avg       0.71      0.72      0.72       231
weighted avg       0.74      0.74      0.74       231
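A training accuracy of 1.0000 against 0.7403 on the test set is a classic sign of overfitting: the forest has essentially memorized the training data. One common remedy is to constrain the depth of the trees; a minimal sketch (the resulting scores would need to be re-checked):

# Shallower trees trade training fit for better generalization
modelo_v2b = RandomForestClassifier(max_depth = 5, random_state = 42)
modelo_v2b.fit(X_treino, Y_treino.ravel())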
Logistic regression: a classification algorithm¶
In [72]:
from sklearn.linear_model import LogisticRegression
In [73]:
# Third version of the model, using Logistic Regression
modelo_v3 = LogisticRegression(C = 0.7, random_state = 42, max_iter = 1000)
modelo_v3.fit(X_treino, Y_treino.ravel())
lr_predict_test = modelo_v3.predict(X_teste)
In [74]:
print("Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_teste, lr_predict_test)))
print()
print("Classification Report")
print(metrics.classification_report(Y_teste, lr_predict_test, labels = [1, 0]))
Accuracy: 0.7359

Classification Report
              precision    recall  f1-score   support

           1       0.63      0.59      0.61        80
           0       0.79      0.81      0.80       151

    accuracy                           0.74       231
   macro avg       0.71      0.70      0.70       231
weighted avg       0.73      0.74      0.73       231
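In a clinical setting, missing a diabetic patient (a false negative) is usually costlier than a false alarm, so one might trade precision for recall by lowering the decision threshold below the default 0.5; a minimal sketch (the 0.3 cutoff is purely illustrative, not a clinical recommendation):

# Probability of class 1 for each test row, then a custom cutoff
probs = modelo_v3.predict_proba(X_teste)[:, 1]
lr_predict_custom = (probs >= 0.3).astype(int)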
Predictions with the trained model¶
In [76]:
import pickle
In [77]:
# Saving the model for later use
filename = 'modelo_treinado_v3.sav'
pickle.dump(modelo_v3, open(filename, 'wb'))
In [78]:
X_teste
Out[78]:
array([[6.00000000e+00, 9.80000000e+01, 5.80000000e+01, ..., 3.40000000e+01, 4.30000000e-01, 4.30000000e+01],
       [2.00000000e+00, 1.12000000e+02, 7.50000000e+01, ..., 3.57000000e+01, 1.48000000e-01, 2.10000000e+01],
       [2.00000000e+00, 1.08000000e+02, 6.40000000e+01, ..., 3.08000000e+01, 1.58000000e-01, 2.10000000e+01],
       ...,
       [4.85714286e+00, 1.27000000e+02, 8.00000000e+01, ..., 3.63000000e+01, 8.04000000e-01, 2.30000000e+01],
       [6.00000000e+00, 1.05000000e+02, 7.00000000e+01, ..., 3.08000000e+01, 1.22000000e-01, 3.70000000e+01],
       [5.00000000e+00, 7.70000000e+01, 8.20000000e+01, ..., 3.58000000e+01, 1.56000000e-01, 3.50000000e+01]])
In [83]:
# Loading the saved model and making predictions on new data
# (here X_teste stands in for new data, which must first go through the same cleaning and transformation steps)
loaded_model = pickle.load(open(filename, 'rb'))
resultado1 = loaded_model.predict(X_teste[15].reshape(1, -1))
resultado2 = loaded_model.predict(X_teste[18].reshape(1, -1))
print(resultado1)
print(resultado2)
[0]
[1]