You've already forked ui-cviko1
329 KiB
329 KiB
In [1]:
%pip install pandas
In [2]:
import pandas
In [17]:
loans = pandas.read_csv('loan_historical_data.csv', sep=";")
In [18]:
loans
Out[18]:
In [5]:
%pip install scikit-learn
In [6]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
In [19]:
arr1 = [1,1,2,2,2,4,4,2,3]
In [20]:
set(arr1)
Out[20]:
In [21]:
list(set(arr1))
Out[21]:
In [22]:
arr2 = ["High", "Low", "Low", "Low", "High"]
In [23]:
list(set(arr2))
Out[23]:
In [24]:
def cat2int(col):
    """Replace every categorical value in ``col`` with an integer code, in place.

    Codes are assigned by sorting the distinct values, so the same input always
    produces the same mapping.  (Taking the order straight from ``set(col)``
    is not stable for strings: it can change from one interpreter run to the
    next because of hash randomisation.)

    Parameters
    ----------
    col : mutable sequence
        Anything that supports iteration and ``col[i] = value`` for
        ``i = 0 .. len(col) - 1``, e.g. a list.  Items are addressed as
        ``col[i]``, so a pandas Series is expected to carry the default
        0..n-1 index.

    Returns
    -------
    The very same ``col`` object, now holding the integer codes.
    """
    distinct = set(col)
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Mutually incomparable values (e.g. str mixed with numbers):
        # fall back to ordering by their string form so the result is
        # still deterministic.
        ordered = sorted(distinct, key=str)
    # Dict lookup replaces the linear list.index() scan per element.
    codes = {value: code for code, value in enumerate(ordered)}
    for i, value in enumerate(col):
        col[i] = codes[value]
    return col
In [25]:
# Encode the "Gender" column as integers.  The codes are computed on a plain
# list copy and assigned back explicitly: mutating the Series returned by
# loans["Gender"] only changes the DataFrame when the two happen to share
# memory, which does not hold under pandas Copy-on-Write.
loans["Gender"] = cat2int(loans["Gender"].tolist())
loans["Gender"]
Out[25]:
In [26]:
loans
Out[26]:
In [27]:
# Encode the remaining categorical columns as integers.  Each column is
# converted on a plain list copy and assigned back explicitly, so the update
# does not depend on the selected Series sharing memory with the DataFrame
# (it does not under pandas Copy-on-Write).
for column in ("Income", "Credit", "Unemployed"):
    loans[column] = cat2int(loans[column].tolist())
# Show the last encoded column, as the final call in this cell did before.
loans["Unemployed"]
Out[27]:
In [28]:
loans
Out[28]:
In [29]:
# Separate the predictors from the label the tree should learn.
X = loans.loc[:, ['Income', 'Credit', 'Gender', 'Unemployed']]  # feature columns
y = loans['Safe']  # target column
In [31]:
X
Out[31]:
In [32]:
y
Out[32]:
In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15) # 70% training and 30% test
In [34]:
X_train
Out[34]:
In [35]:
y_train
Out[35]:
In [37]:
X_test
Out[37]:
In [36]:
y_test
Out[36]:
In [38]:
# Build a decision tree classifier and train it in one step.  Fitting is where
# the model learns how the features (Income, Credit, Gender, Unemployed)
# relate to the target (Safe); fit() hands back the fitted estimator, which
# is what ends up bound to clf.
clf = DecisionTreeClassifier().fit(X_train, y_train)
In [39]:
y_pred = clf.predict(X_test)
In [40]:
X_test
Out[40]:
In [41]:
y_pred
Out[41]:
In [42]:
y_test
Out[42]:
In [43]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
In [45]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
In [46]:
y_pred = clf.predict(X_test)
In [47]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
In [50]:
%pip install pydotplus
In [51]:
from io import StringIO  # stdlib text buffer; the same class six.StringIO resolves to on Python 3
from IPython.display import Image
from sklearn.tree import export_graphviz
In [52]:
import pydotplus
In [53]:
# Export the fitted tree as Graphviz DOT text into an in-memory buffer, then
# render it: once to the file 'graf.png' and once inline in the notebook.
# NOTE(review): scikit-learn pairs class_names with the classes in ascending
# order — confirm that ['no', 'yes'] matches the sorted values of loans.Safe.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=['Income', 'Credit', 'Gender', 'Unemployed'],
    class_names=['no', 'yes'],
    filled=True,
    rounded=False,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('graf.png')
Image(graph.create_png())
Out[53]:
Titanic¶
In [54]:
titanic = pandas.read_csv('titanic_full.csv')
In [55]:
titanic.head()
Out[55]:
In [56]:
titanic.tail()
Out[56]:
In [57]:
titanic.describe()
Out[57]:
In [58]:
titanic.Survived.value_counts()
Out[58]:
In [59]:
titanic.Sex.value_counts()
Out[59]:
In [60]:
titanic.Cabin.value_counts()
Out[60]:
In [61]:
titanic.Embarked.value_counts()
Out[61]:
In [62]:
# Encode passenger sex numerically: 'male' becomes 0 and 'female' becomes 1.
titanic['Sex'] = titanic['Sex'].replace(to_replace={'male': 0, 'female': 1})
In [63]:
# Features: every column except the target and the listed identifier / free-text
# columns.  Index.difference returns the remaining labels in sorted order, so
# the columns of X are ordered alphabetically rather than as in the CSV.
# NOTE(review): the '' entry only has an effect if a column is literally named
# '' — confirm whether it is intentional.
X = titanic[
    titanic.columns.difference(
        ['Survived', 'PassengerId', '', 'Name', 'Ticket', 'Cabin', 'Embarked']
    )
]
y = titanic['Survived']  # target column
In [64]:
X
Out[64]:
In [65]:
y
Out[65]:
In [66]:
# Hold out 20% of the rows for testing.  The fixed seed (the same value the
# loans split uses) makes the split — and with it the accuracy, the prediction
# and the plotted tree further down — reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)
In [68]:
# Depth-limited decision tree.  random_state is fixed because scikit-learn
# permutes the candidate features at every split, so ties between equally good
# splits would otherwise be broken differently from run to run.
clf = DecisionTreeClassifier(max_depth=4, random_state=15)
# Train the decision tree classifier on the training split
clf = clf.fit(X_train, y_train)
# Predict the response for the test dataset
y_pred = clf.predict(X_test)
In [69]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
In [70]:
import numpy as np
In [ ]:
# Hypothetical passenger: a 23-year-old man without children.
# NOTE(review): the six values must follow the column order of X.  Assuming X
# holds the alphabetically sorted columns Age, Fare, Parch, Pclass, Sex, SibSp
# (verify with X.columns), the fifth value is Sex = 1, which the
# 'male': 0 / 'female': 1 mapping applied to titanic['Sex'] reads as female —
# that would contradict "man" in the description above; confirm which is meant.
person = np.array([23, 100, 0, 1, 1, 0])
In [72]:
person
Out[72]:
In [73]:
person.reshape(1, -1)
Out[73]:
In [ ]:
# Will this passenger survive?  The vector is wrapped in a one-row DataFrame that
# carries the training column names: the model was fitted on a DataFrame, and
# predicting on a bare ndarray makes scikit-learn warn that X has no valid
# feature names.  A wrong number of values also fails right here, with the
# column names in view.
print(
    'Prediction: ',
    clf.predict(pandas.DataFrame(person.reshape(1, -1), columns=X_train.columns)),
)
In [75]:
titanic_columns = X_train.columns.to_list()
In [76]:
import matplotlib.pyplot as plt
from sklearn import tree

# Draw the fitted tree on a wide figure.  The return value of plot_tree is
# assigned to `_` so that it is not echoed as the cell's output.
# NOTE(review): class_names are paired with the classes in ascending order —
# '0' / '1' presumably mirror the values of titanic.Survived; confirm.
fig = plt.figure(figsize=(20, 6))
_ = tree.plot_tree(
    clf,
    feature_names=titanic_columns,
    class_names=['0', '1'],
    filled=True,
)
In [ ]: