Files
ui-cviko1/notebook/cviko8.ipynb
2025-04-16 10:33:53 +02:00

329 KiB
Raw Blame History

In [1]:
%pip install pandas
Collecting pandas
  Using cached pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Requirement already satisfied: numpy>=1.26.0 in /home/br0kenpixel/Documents/ui-cviko1/lib64/python3.13/site-packages (from pandas) (2.2.4)
Requirement already satisfied: python-dateutil>=2.8.2 in /home/br0kenpixel/Documents/ui-cviko1/lib64/python3.13/site-packages (from pandas) (2.9.0.post0)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Requirement already satisfied: six>=1.5 in /home/br0kenpixel/Documents/ui-cviko1/lib64/python3.13/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
Using cached pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.2 tzdata-2025.2

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
In [2]:
import pandas
In [17]:
loans = pandas.read_csv('loan_historical_data.csv', sep=";")
In [18]:
loans
Out[18]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Client Income Credit Gender Unemployed Safe
0 K1 High Excellent Female No Yes
1 K2 High Excellent Man No Yes
2 K3 Low Poor Man No No
3 K4 Low Excellent Female Yes Yes
4 K5 Low Excellent Man Yes Yes
5 K6 Low Poor Female Yes No
6 K7 High Poor Man No Yes
7 K8 High Poor Female Yes Yes
8 K9 Low Fair Man Yes No
9 K10 High Fair Female No Yes
10 K11 Low Fair Female Yes No
11 K12 Low Fair Man No Yes
In [5]:
%pip install scikit-learn
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Requirement already satisfied: numpy>=1.19.5 in /home/br0kenpixel/Documents/ui-cviko1/lib64/python3.13/site-packages (from scikit-learn) (2.2.4)
Requirement already satisfied: scipy>=1.6.0 in /home/br0kenpixel/Documents/ui-cviko1/lib64/python3.13/site-packages (from scikit-learn) (1.15.2)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.2/13.2 MB 8.3 MB/s eta 0:00:00a 0:00:01
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.6.0

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
In [6]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
In [19]:
arr1 = [1,1,2,2,2,4,4,2,3]
In [20]:
set(arr1)
Out[20]:
{1, 2, 3, 4}
In [21]:
list(set(arr1))
Out[21]:
[1, 2, 3, 4]
In [22]:
arr2 = ["High", "Low", "Low", "Low", "High"]
In [23]:
list(set(arr2))
Out[23]:
['High', 'Low']
In [24]:
def cat2int(col):
    vals = list(set(col))
    for i, string in enumerate(col):
        col[i] = vals.index(string)
    return col
In [25]:
cat2int(loans["Gender"])
Out[25]:
0     0
1     1
2     1
3     0
4     1
5     0
6     1
7     0
8     1
9     0
10    0
11    1
Name: Gender, dtype: object
In [26]:
loans
Out[26]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Client Income Credit Gender Unemployed Safe
0 K1 High Excellent 0 No Yes
1 K2 High Excellent 1 No Yes
2 K3 Low Poor 1 No No
3 K4 Low Excellent 0 Yes Yes
4 K5 Low Excellent 1 Yes Yes
5 K6 Low Poor 0 Yes No
6 K7 High Poor 1 No Yes
7 K8 High Poor 0 Yes Yes
8 K9 Low Fair 1 Yes No
9 K10 High Fair 0 No Yes
10 K11 Low Fair 0 Yes No
11 K12 Low Fair 1 No Yes
In [27]:
cat2int(loans["Income"])
cat2int(loans["Credit"])
cat2int(loans["Unemployed"])
Out[27]:
0     0
1     0
2     0
3     1
4     1
5     1
6     0
7     1
8     1
9     0
10    1
11    0
Name: Unemployed, dtype: object
In [28]:
loans
Out[28]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Client Income Credit Gender Unemployed Safe
0 K1 0 0 0 0 Yes
1 K2 0 0 1 0 Yes
2 K3 1 1 1 0 No
3 K4 1 0 0 1 Yes
4 K5 1 0 1 1 Yes
5 K6 1 1 0 1 No
6 K7 0 1 1 0 Yes
7 K8 0 1 0 1 Yes
8 K9 1 2 1 1 No
9 K10 0 2 0 0 Yes
10 K11 1 2 0 1 No
11 K12 1 2 1 0 Yes
In [29]:
#split dataset in features and target variable

X = loans[['Income','Credit','Gender','Unemployed']] # Features
y = loans.Safe # Target variable
In [31]:
X
Out[31]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Income Credit Gender Unemployed
0 0 0 0 0
1 0 0 1 0
2 1 1 1 0
3 1 0 0 1
4 1 0 1 1
5 1 1 0 1
6 0 1 1 0
7 0 1 0 1
8 1 2 1 1
9 0 2 0 0
10 1 2 0 1
11 1 2 1 0
In [32]:
y
Out[32]:
0     Yes
1     Yes
2      No
3     Yes
4     Yes
5      No
6     Yes
7     Yes
8      No
9     Yes
10     No
11    Yes
Name: Safe, dtype: object
In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15) # 70% training and 30% test
In [34]:
X_train
Out[34]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Income Credit Gender Unemployed
9 0 2 0 0
3 1 0 0 1
4 1 0 1 1
0 0 0 0 0
7 0 1 0 1
10 1 2 0 1
5 1 1 0 1
8 1 2 1 1
In [35]:
y_train
Out[35]:
9     Yes
3     Yes
4     Yes
0     Yes
7     Yes
10     No
5      No
8      No
Name: Safe, dtype: object
In [37]:
X_test
Out[37]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Income Credit Gender Unemployed
11 1 2 1 0
6 0 1 1 0
2 1 1 1 0
1 0 0 1 0
In [36]:
y_test
Out[36]:
11    Yes
6     Yes
2      No
1     Yes
Name: Safe, dtype: object
In [38]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
# Training the model on the data, storing the information learned from the data
# Model is learning the relationship between x (features: Income, Credit,	Gender,	Unemployed) and y (Safe)
clf = clf.fit(X_train,y_train)
In [39]:
y_pred = clf.predict(X_test)
In [40]:
X_test
Out[40]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Income Credit Gender Unemployed
11 1 2 1 0
6 0 1 1 0
2 1 1 1 0
1 0 0 1 0
In [41]:
y_pred
Out[41]:
array(['No', 'Yes', 'No', 'Yes'], dtype=object)
In [42]:
y_test
Out[42]:
11    Yes
6     Yes
2      No
1     Yes
Name: Safe, dtype: object
In [43]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.75
In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
In [45]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
In [46]:
y_pred = clf.predict(X_test)
In [47]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.5
In [50]:
%pip install pydotplus
Collecting pydotplus
  Downloading pydotplus-2.0.2.tar.gz (278 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Requirement already satisfied: pyparsing>=2.0.1 in /home/br0kenpixel/Documents/ui-cviko1/lib64/python3.13/site-packages (from pydotplus) (3.2.3)
Building wheels for collected packages: pydotplus
  Building wheel for pydotplus (pyproject.toml) ... done
  Created wheel for pydotplus: filename=pydotplus-2.0.2-py3-none-any.whl size=24687 sha256=c477e8981a47f023f93b6f4aa926f3899a8d5302c81197821877eb34780c3280
  Stored in directory: /home/br0kenpixel/.cache/pip/wheels/4a/c0/ed/a9eeeb08c3c53bb90d3822cf76557c8fdcbc349ee11a011169
Successfully built pydotplus
Installing collected packages: pydotplus
Successfully installed pydotplus-2.0.2

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
In [51]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
In [52]:
import pydotplus
In [53]:
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=False,
                special_characters=True,feature_names = ['Income','Credit','Gender','Unemployed'],class_names=['no','yes'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('graf.png')
Image(graph.create_png())
Out[53]:
No description has been provided for this image

Titanic

In [54]:
titanic = pandas.read_csv('titanic_full.csv')
In [55]:
titanic.head()
Out[55]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [56]:
titanic.tail()
Out[56]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.00 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.00 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.45 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.00 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.75 NaN Q
In [57]:
titanic.describe()
Out[57]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [58]:
titanic.Survived.value_counts()
Out[58]:
Survived
0    549
1    342
Name: count, dtype: int64
In [59]:
titanic.Sex.value_counts()
Out[59]:
Sex
male      577
female    314
Name: count, dtype: int64
In [60]:
titanic.Cabin.value_counts()
Out[60]:
Cabin
G6             4
C23 C25 C27    4
B96 B98        4
F2             3
D              3
              ..
E17            1
A24            1
C50            1
B42            1
C148           1
Name: count, Length: 147, dtype: int64
In [61]:
titanic.Embarked.value_counts()
Out[61]:
Embarked
S    644
C    168
Q     77
Name: count, dtype: int64
In [62]:
titanic['Sex'] = titanic['Sex'].replace({'male': 0, 'female': 1})
/tmp/ipykernel_4882/3535274200.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  titanic['Sex'] = titanic['Sex'].replace({'male': 0, 'female': 1})
In [63]:
X = titanic[titanic.columns.difference(['Survived','PassengerId','','Name','Ticket','Cabin','Embarked'])] # Všetko okrem
y = titanic.Survived
In [64]:
X
Out[64]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Age Fare Parch Pclass Sex SibSp
0 22.0 7.2500 0 3 0 1
1 38.0 71.2833 0 1 1 1
2 26.0 7.9250 0 3 1 0
3 35.0 53.1000 0 1 1 1
4 35.0 8.0500 0 3 0 0
... ... ... ... ... ... ...
886 27.0 13.0000 0 2 0 0
887 19.0 30.0000 0 1 1 0
888 NaN 23.4500 2 3 1 1
889 26.0 30.0000 0 1 0 0
890 32.0 7.7500 0 3 0 0

891 rows × 6 columns

In [65]:
y
Out[65]:
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [68]:
clf = DecisionTreeClassifier(max_depth=4)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
In [69]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.7821229050279329
In [70]:
import numpy as np
In [ ]:
person = np.array([ # 23 ročný muž bez detí
    23,
    100,
    0,
    1,
    1,
    0
])
In [72]:
person
Out[72]:
array([ 23, 100,   0,   1,   1,   0])
In [73]:
person.reshape(1, -1)
Out[73]:
array([[ 23, 100,   0,   1,   1,   0]])
In [ ]:
print('Prediction: ', clf.predict(person.reshape(1,-1))) # Prežije?
Prediction:  [1]
/home/br0kenpixel/Documents/ui-cviko1/lib64/python3.13/site-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
  warnings.warn(
In [75]:
titanic_columns = X_train.columns.to_list()
In [76]:
import matplotlib.pyplot as plt
from sklearn import tree
fig = plt.figure(figsize=(20,6))
_ = tree.plot_tree(clf,
                   feature_names = titanic_columns,
                   class_names=['0','1'],
                   filled=True)
No description has been provided for this image
In [ ]: