Solution
A few important pieces of information we need before we begin:

Info 1 : Decide the stopping criteria of the algorithm. Common choices:
- If the tree height becomes equal to max_height, stop the algorithm.
- Stop when a node has fewer than the minimum number of elements needed for a split.
- Stop when a node has >= 90% of its elements from the same class, etc.
For now we will use max_height as the stopping condition (a combined check is sketched below).
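
As an illustration only, the three criteria above could be combined into a single check. This is a hypothetical helper (the function name and the min_samples_split / purity_threshold defaults are assumptions), not part of the implementation that follows:

# Hypothetical combined stopping check, for illustration; the main code below
# uses only the max-height and node-purity conditions.
def should_stop(labels, height, max_height, min_samples_split=2, purity_threshold=0.9):
    if height >= max_height:                        # criterion 1: height limit reached
        return True
    if len(labels) < min_samples_split:             # criterion 2: too few samples to split
        return True
    majority = max(labels.count(0), labels.count(1))
    if majority / len(labels) >= purity_threshold:  # criterion 3: node is (almost) pure
        return True
    return False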

Info 2 : Decide the feature-splitting metric. The two common options are Gini Impurity and Entropy.
We will use Gini Impurity in this case; feel free to explore the other metric as well.
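
For binary labels, Gini Impurity is 1 - p_pos^2 - p_neg^2, where p_pos and p_neg are the fractions of positive and negative labels in the node. A quick worked example with made-up labels:

# Gini impurity of a node with 6 positive and 2 negative labels (illustrative values).
labels = [1, 1, 1, 1, 1, 1, 0, 0]
p_pos = labels.count(1) / len(labels)   # 6/8 = 0.75
p_neg = labels.count(0) / len(labels)   # 2/8 = 0.25
gini = 1 - p_pos**2 - p_neg**2          # 1 - 0.5625 - 0.0625 = 0.375
# A pure node gives 0.0; a perfect 50/50 node gives 0.5.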

Algorithm : We will use a recursive partitioning algorithm.

Step 1 : Check the base condition (stopping condition).
If the stopping condition is True, create a leaf node and assign an output value to it.
In our implementation all leaf nodes will have a non-null output_value, while all decision nodes (intermediate nodes) will have output_value=None.

Step 2 : Get the best feature and split value for splitting the data.
Step 2(a) : For each candidate feature value, compute the Information Gain using Gini Impurity.
Step 2(b) : Select the threshold that gives the maximum Information Gain.
Note: Instead of checking all the values, you can bin each feature using percentiles, use the percentile values as thresholds, and compute the Information Gain only for those. This improves the time complexity of the algorithm; a sketch follows.
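
A minimal sketch of that idea, assuming numeric feature columns and numpy being available; the helper name, the number of bins, and the example column are illustrative:

# Percentile binning: evaluate a handful of thresholds per feature instead of
# every distinct value. num_bins=10 yields the 10th, 20th, ..., 90th percentiles.
import numpy as np

def candidate_thresholds(feature_column, num_bins=10):
    percentiles = np.linspace(0, 100, num_bins + 1)[1:-1]      # 10, 20, ..., 90
    return sorted(set(np.percentile(feature_column, percentiles)))

col = [3.2, 7.5, 1.1, 9.8, 4.4, 6.0, 2.7, 8.1]                 # made-up feature values
print(candidate_thresholds(col))                               # a few thresholds to try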

Step 3 : Create a decision node and set feature and feature_val on it.
- Left node dataset : all the data points where feature <= feature_val
- Right node dataset : all the data points where feature > feature_val

Step 4 : Recursively build
- Left node : use the left dataset and build the left subtree
- Right node : use the right dataset and build the right subtree
Code
class DecisionTreeNode:
    """
    Decision node : output_val == None
    Leaf node     : left == None, right == None, feature == None, feature_val == None,
                    output_val != None
    """
    def __init__(self, left, right, feature, feature_val, output_val=None):
        self.left = left
        self.right = right
        self.feature = feature          # index of the feature used for the split
        self.feature_val = feature_val  # threshold: left subtree <= feature_val, right > feature_val
        self.output_val = output_val    # leaf prediction (fraction of positive labels)
class DecisionTreeModel:
    def __init__(self, X, feature_name_list, y, max_height):
        self.data = X                               # 2D list (N, k)
        self.feature_name_list = feature_name_list  # list of k feature names
        self.label_feature = y                      # 1D list of N binary labels (0/1)
        self.max_height = max_height
        self.root = None

    def train_decision_tree(self):
        # Build the tree recursively and keep a handle to the root node.
        self.root = self.train_decision_tree_helper(data=self.data,
                                                    label_feature=self.label_feature,
                                                    height=0)
        return self.root

    def train_decision_tree_helper(self, data, label_feature, height):
        # Step 1: stopping condition -> create a leaf node.
        if self.check_base_case(label_feature, height):
            leaf_output = self.compute_leaf_output(label_feature)
            return DecisionTreeNode(None, None, None, None, leaf_output)
        # Step 2: find the best (feature, threshold) pair by Information Gain.
        best_feature, best_feature_val, best_left_data, best_left_label, \
            best_right_data, best_right_label = self.get_best_feature_split(data, label_feature)
        # No valid split found (e.g. all feature values identical) -> make a leaf.
        if best_feature is None:
            return DecisionTreeNode(None, None, None, None, self.compute_leaf_output(label_feature))
        # Step 3: create the decision node for the chosen split.
        decision_node = DecisionTreeNode(None, None, best_feature, best_feature_val)
        # Step 4: recursively build the left and right subtrees.
        decision_node.left = self.train_decision_tree_helper(best_left_data, best_left_label, height + 1)
        decision_node.right = self.train_decision_tree_helper(best_right_data, best_right_label, height + 1)
        return decision_node

    def check_base_case(self, label_feature, height):
        # Stop when the node is pure or the maximum height is reached.
        return len(set(label_feature)) == 1 or height == self.max_height

    def compute_gini(self, label_feature):
        # Gini impurity = 1 - p_pos^2 - p_neg^2 for binary labels.
        pos_count = 0
        neg_count = 0
        total_count = len(label_feature) + 1e-5  # avoid division by zero on empty nodes
        for i in label_feature:
            if i == 1:
                pos_count += 1
            else:
                neg_count += 1
        return 1 - (pos_count / total_count) ** 2 - (neg_count / total_count) ** 2

    def compute_information_gain(self, root_labels, left_labels, right_labels):
        # Information Gain = parent impurity minus the weighted impurity of the children.
        root_gini = self.compute_gini(root_labels)
        left_gini = self.compute_gini(left_labels)
        right_gini = self.compute_gini(right_labels)
        left_weight = len(left_labels) / len(root_labels)
        right_weight = len(right_labels) / len(root_labels)
        return root_gini - left_weight * left_gini - right_weight * right_gini

    def get_datasets(self, data, feature_index, feature_val, label_feature):
        # Split the rows: feature <= feature_val goes left, feature > feature_val goes right.
        left_data, left_label = [], []
        right_data, right_label = [], []
        for i in range(len(data)):
            if data[i][feature_index] <= feature_val:
                left_data.append(data[i])
                left_label.append(label_feature[i])
            else:
                right_data.append(data[i])
                right_label.append(label_feature[i])
        return left_data, left_label, right_data, right_label

    def get_best_feature_split(self, data, label_feature):
        best_feature = None
        best_feature_val = None
        best_information_gain = -1
        best_left_data, best_left_label = None, None
        best_right_data, best_right_label = None, None
        for i, feature in enumerate(self.feature_name_list):
            # Candidate thresholds: all distinct values of the i-th feature column.
            for feature_val in set(row[i] for row in data):
                left_data, left_label, right_data, right_label = \
                    self.get_datasets(data, i, feature_val, label_feature)
                # Skip degenerate splits where one side is empty.
                if not left_label or not right_label:
                    continue
                information_gain = self.compute_information_gain(label_feature, left_label, right_label)
                if information_gain > best_information_gain:
                    best_information_gain = information_gain
                    best_feature = i
                    best_feature_val = feature_val
                    best_left_data, best_left_label = left_data, left_label
                    best_right_data, best_right_label = right_data, right_label
        return best_feature, best_feature_val, best_left_data, best_left_label, best_right_data, best_right_label

    def compute_leaf_output(self, label_feature):
        # Leaf prediction: fraction of positive (class 1) labels in the node.
        pos_count = 0
        neg_count = 0
        for i in label_feature:
            if i == 1:
                pos_count += 1
            else:
                neg_count += 1
        return pos_count / (pos_count + neg_count)
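
The class above only builds the tree. Below is a hedged usage sketch with a small made-up dataset; the predict_one traversal helper is an assumption and not part of the original solution:

# Hypothetical usage: 6 rows, 2 features, binary labels.
X = [[2.0, 30], [1.5, 40], [3.0, 10], [2.5, 15], [0.5, 45], [3.5, 5]]
y = [0, 0, 1, 1, 0, 1]
model = DecisionTreeModel(X, ["feature_a", "feature_b"], y, max_height=3)
root = model.train_decision_tree()

def predict_one(node, row):
    # Walk down from the root until a leaf (non-null output_val) is reached.
    while node.output_val is None:
        if row[node.feature] <= node.feature_val:
            node = node.left
        else:
            node = node.right
    return node.output_val   # fraction of positive labels in that leaf

print(predict_one(root, [2.8, 12]))   # score close to 1.0 => predicted class 1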