<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Another Book on Data Science - Machine Learning - A gentle introduction</title>
<meta name="description" content="data science, R, Python, programming, machine learning">
<meta name="author" content="Nailong Zhang">
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
<script src="https://html5shim.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
<!-- Le styles -->
<link rel="stylesheet" href="bootstrap-1.1.0.min.css">
<link rel="stylesheet" href="style.css">
<link rel="stylesheet" href="small-screens.css">
<link rel="stylesheet" href="vs.css">
<link rel="stylesheet" href="code.css">
<link rel="stylesheet" href="application.css">
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-142297640-1', 'anotherbookondatascience.com');
ga('send', 'pageview');
</script>
</head>
<body>
<div class="topbar">
<div class="fill">
<div class="container-fluid fixed">
<h3><a href="index.html">Machine Learning - A gentle introduction</a></h3>
<ul class="nav secondary-nav">
<li><a href="chapter6.html">«Previous</a></li>
</ul>
</div>
</div>
</div>
<div class="container-fluid" style="padding-top: 60px;">
<p>Sections in this Chapter:</p>
<ul>
<li><a href="#sl">Supervised learning</a></li>
<li><a href="#gbm">Gradient boosting machine</a></li>
<li><a href="#ul">Unsupervised learning</a></li>
<li><a href="#rl">Reinforcement learning</a></li>
<li><a href="#dqn">Deep Q-networks</a></li>
<li><a href="#cd">Computational differentiation</a></li>
</ul>
<p>Machine learning is such a huge topic that we can only touch on a bit of it in this book. There are different learning paradigms.</p>
<p>Basically, supervised learning refers to the learning task in which we have both input data and output data, and the purpose is to learn a functional map from the input data to the output data. In contrast to supervised learning, there is unsupervised learning, in which only the input data is available. In addition to supervised/unsupervised learning, there are also other learning paradigms, such as reinforcement learning <sup class="footnote" id="fnr20"><a href="#fn20">20</a></sup> and transfer learning <sup class="footnote" id="fnr21"><a href="#fn21">21</a></sup>.</p>
<p>There is no strict distinction between different learning paradigms. For example, semi-supervised learning <sup class="footnote" id="fnr22"><a href="#fn22">22</a></sup> falls between supervised learning and unsupervised learning, and the supervised learning paradigm can also be found inside some modern reinforcement learning algorithms.</p>
<h2 id="sl">Supervised learning</h2>
<p>In this book, we have talked about quite a few supervised learning models, such as linear regression and its variants, and logistic regression. We will also spend some effort on tree-based models in this chapter. There are many other interesting supervised learning models that we don’t cover in this book, such as the support vector machine <sup class="footnote" id="fnr23"><a href="#fn23">23</a></sup> and linear discriminant analysis <sup class="footnote" id="fnr24"><a href="#fn24">24</a></sup>. Almost all well-known supervised learning models’ implementations can be found online, and I recommend learning from reading the source code. As for usage, there is an abundance of off-the-shelf options in both R and Python. Algorithm-wise, my favorite supervised learning models include linear models, gradient boosting trees, and (deep) neural network models. Linear models are simple with great interpretability, and it is rare that gradient boosting tree models or deep neural network models cannot match the prediction accuracy of other models for a specific problem.</p>
<h3>Population & random sample</h3>
<p>A population is a complete set of elements of interest for a specific problem. It is usually defined by the researcher of the problem. A population can be either finite or infinite. For example, if we define the set of real numbers as our population, it is infinite. But if we are only interested in the integers between 1 and 100, then we get a finite population.</p>
<p>A random sample is a set of elements selected from a population. The size of a random sample can even be larger than the size of the population, since an element may be selected multiple times (i.e., sampling with replacement).<br />
Why do we care about random samples? Because we are interested in the population from which the random sample is taken, and the random sample helps us make inferences about the population.</p>
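<p>As a quick illustration (a minimal sketch using <code>numpy</code>; the example is mine, not part of the original text), we can draw a random sample with replacement whose size exceeds the size of the population:</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np

np.random.seed(42)
# a finite population: the integers between 1 and 100
population = np.arange(1, 101)
# with replacement, the sample size (150) can exceed the population size (100)
sample = np.random.choice(population, size=150, replace=True)
print(len(sample))             # 150
print(len(np.unique(sample)))  # duplicates occur, so typically fewer than 100
</code></pre></figure>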
<p>In predictive modeling, each element in the population has a set of attributes which are usually called features or covariates, as well as a label. For example, a bank may use a mortgage applicant’s personal information (<span class="caps">FICO</span> score, years of employment, debt-to-income ratio, etc.) as the covariates, and the status of the mortgage (default, or paid off) as the label. A predictive model can be used to predict the final status of the mortgage for a new applicant; such models are classification models. When a mortgage is in default status, the applicant may have already made partial payments. Thus, a model to predict the amount of loss for the bank is also useful; such models are regression models.</p>
<p>But why do we need a predictive model? If we knew the labels of the entire population, there would be nothing to learn. All we would need is a database table or a dictionary (HashMap) for lookup. The issue is that many real-world problems don’t allow us to have the labels of the entire population. And thus, we need to learn or infer based on the random sample collected from the unknown population.</p>
<h2 id="approximator">Universal approximation & overfitting</h2>
<h3>Universal approximation</h3>
<p>The universal approximation theorem says that a single-hidden-layer neural network can approximate any continuous function ($\mathbf{R}^n\rightarrow\mathbf{R}$) with a sufficient number of neurons, under mild assumptions on the activation function (for example, the sigmoid activation function)<sup class="footnote" id="fnr1"><a href="#fn1">1</a></sup>. There are also other universal approximators, such as decision trees.</p>
<p>Not all machine learning models can approximate universally, for example, linear regression without polynomial terms. If we had the data of the entire population, we could fit the population with a universal approximator. But as we discussed earlier, when the entire population is available there is no need to fit a machine learning model for prediction. And if only a random sample is available, is a universal approximator still the best choice? It depends.</p>
<h3>Overfitting & cross-validation</h3>
<p>One of the risks of fitting a random sample with a predictive model is overfitting (see the figure below). Using a universal approximator as the predictive model may even amplify this risk.</p>
<figure class="text-center">
<img src="figures/overfit.png" alt="A random sample from a population that follows a linear model (the dashed line) is overfit by the solid curve." style="display: block;margin: auto;" width="50%">
<figcaption class="centerfigcaption">A random sample from a population that follows a linear model (the dashed line) is overfit by the solid curve.</figcaption>
</figure>
<p>To mitigate the risk of overfitting, we usually use cross-validation to assess how accurately a predictive model is able to predict on unseen data. The following steps specify how to perform $k$-fold cross-validation.</p>
<ul>
<li>divide the training data into $k$ partitions randomly</li>
<li>for $i=1,…,k$<br />
- train a model using all partitions except partition $i$<br />
- record the prediction accuracy of the trained model on partition $i$</li>
<li>calculate the average prediction accuracy.</li>
</ul>
<figure class="text-center">
<img src="figures/cv.png" alt="$k$ fold cross-validation." style="display: block;margin: auto;" width="50%">
<figcaption class="centerfigcaption">$k$ fold cross-validation.</figcaption>
</figure>
<p>Cross-validation can also be considered a metaheuristic algorithm since it is not problem-specific: it doesn’t matter what type of predictive model we use.</p>
<p>There are ready-to-use tools for cross-validation in both R and Python. But for pedagogical purposes, let’s do it step by step, cross-validating the Lasso regression we built in the previous chapter on the Boston dataset.</p>
<language>R</language>
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno"> 1 </span><span class="kn">source</span><span class="p">(</span><span class="s">'../chapter5/lasso.R'</span><span class="p">)</span>
<span class="lineno"> 2 </span>
<span class="lineno"> 3 </span><span class="kn">library</span><span class="p">(</span>caret<span class="p">)</span>
<span class="lineno"> 4 </span><span class="kn">library</span><span class="p">(</span>MASS<span class="p">)</span>
<span class="lineno"> 5 </span><span class="kn">library</span><span class="p">(</span>Metrics<span class="p">)</span> <span class="c1"># we use the rmse function from this package</span>
<span class="lineno"> 6 </span>k <span class="o">=</span> <span class="m">5</span>
<span class="lineno"> 7 </span>
<span class="lineno"> 8 </span><span class="kp">set.seed</span><span class="p">(</span><span class="m">42</span><span class="p">)</span>
<span class="lineno"> 9 </span><span class="c1"># if we set returnTrain = TRUE, we get the indices for train partition</span>
<span class="lineno">10 </span>test_indices <span class="o">=</span> createFolds<span class="p">(</span>Boston<span class="o">$</span>medv<span class="p">,</span> k <span class="o">=</span> k<span class="p">,</span> <span class="kt">list</span> <span class="o">=</span> <span class="kc">TRUE</span><span class="p">,</span> returnTrain <span class="o">=</span> <span class="kc">FALSE</span><span class="p">)</span>
<span class="lineno">11 </span>scores <span class="o">=</span> <span class="kp">rep</span><span class="p">(</span><span class="kc">NA</span><span class="p">,</span> k<span class="p">)</span>
<span class="lineno">12 </span>
<span class="lineno">13 </span><span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="m">1</span><span class="o">:</span>k<span class="p">){</span>
<span class="lineno">14 </span> lr <span class="o">=</span> Lasso<span class="o">$</span>new<span class="p">(</span><span class="m">200</span><span class="p">)</span>
<span class="lineno">15 </span> <span class="c1"># we exclude the indices for test partition and train the model</span>
<span class="lineno">16 </span> lr<span class="o">$</span>fit<span class="p">(</span><span class="kp">data.matrix</span><span class="p">(</span>Boston<span class="p">[</span><span class="o">-</span>test_indices<span class="p">[[</span>i<span class="p">]],</span> <span class="o">-</span><span class="kp">ncol</span><span class="p">(</span>Boston<span class="p">)]),</span> Boston<span class="o">$</span>medv<span class="p">[</span><span class="o">-</span>test_indices<span class="p">[[</span>i<span class="p">]]],</span> <span class="m">100</span><span class="p">)</span>
<span class="lineno">17 </span> y_hat <span class="o">=</span> lr<span class="o">$</span>predict<span class="p">(</span><span class="kp">data.matrix</span><span class="p">(</span>Boston<span class="p">[</span>test_indices<span class="p">[[</span>i<span class="p">]],</span> <span class="o">-</span><span class="kp">ncol</span><span class="p">(</span>Boston<span class="p">)]))</span>
<span class="lineno">18 </span> scores<span class="p">[</span>i<span class="p">]</span> <span class="o">=</span> rmse<span class="p">(</span>Boston<span class="o">$</span>medv<span class="p">[</span>test_indices<span class="p">[[</span>i<span class="p">]]],</span> y_hat<span class="p">)</span>
<span class="lineno">19 </span><span class="p">}</span>
<span class="lineno">20 </span><span class="kp">print</span><span class="p">(</span><span class="kp">mean</span><span class="p">(</span>scores<span class="p">))</span></code></pre></figure><language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="kn">import</span> <span class="nn">sys</span>
<span class="lineno"> 2 </span><span class="n">sys</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s2">".."</span><span class="p">)</span>
<span class="lineno"> 3 </span>
<span class="lineno"> 4 </span><span class="kn">from</span> <span class="nn">sklearn.metrics</span> <span class="k">import</span> <span class="n">mean_squared_error</span>
<span class="lineno"> 5 </span><span class="kn">from</span> <span class="nn">sklearn.datasets</span> <span class="k">import</span> <span class="n">load_boston</span>
<span class="lineno"> 6 </span><span class="kn">from</span> <span class="nn">sklearn.model_selection</span> <span class="k">import</span> <span class="n">KFold</span>
<span class="lineno"> 7 </span><span class="kn">from</span> <span class="nn">chapter5.lasso</span> <span class="k">import</span> <span class="n">Lasso</span>
<span class="lineno"> 8 </span>
<span class="lineno"> 9 </span>
<span class="lineno">10 </span><span class="n">boston</span> <span class="o">=</span> <span class="n">load_boston</span><span class="p">()</span>
<span class="lineno">11 </span><span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">boston</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">boston</span><span class="o">.</span><span class="n">target</span>
<span class="lineno">12 </span>
<span class="lineno">13 </span><span class="c1"># create the partitions with k=5</span>
<span class="lineno">14 </span><span class="n">k</span> <span class="o">=</span> <span class="mi">5</span>
<span class="lineno">15 </span><span class="n">kf</span> <span class="o">=</span> <span class="n">KFold</span><span class="p">(</span><span class="n">n_splits</span><span class="o">=</span><span class="n">k</span><span class="p">)</span>
<span class="lineno">16 </span><span class="c1"># create a placeholder for the rmse on each test partition</span>
<span class="lineno">17 </span><span class="n">scores</span> <span class="o">=</span> <span class="p">[]</span>
<span class="lineno">18 </span>
<span class="lineno">19 </span><span class="k">for</span> <span class="n">train_index</span><span class="p">,</span> <span class="n">test_index</span> <span class="ow">in</span> <span class="n">kf</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">X</span><span class="p">):</span>
<span class="lineno">20 </span> <span class="n">X_train</span><span class="p">,</span> <span class="n">X_test</span> <span class="o">=</span> <span class="n">X</span><span class="p">[</span><span class="n">train_index</span><span class="p">],</span> <span class="n">X</span><span class="p">[</span><span class="n">test_index</span><span class="p">]</span>
<span class="lineno">21 </span> <span class="n">y_train</span><span class="p">,</span> <span class="n">y_test</span> <span class="o">=</span> <span class="n">y</span><span class="p">[</span><span class="n">train_index</span><span class="p">],</span> <span class="n">y</span><span class="p">[</span><span class="n">test_index</span><span class="p">]</span>
<span class="lineno">22 </span> <span class="c1"># let's train the model on the train partitions</span>
<span class="lineno">23 </span> <span class="n">lr</span> <span class="o">=</span> <span class="n">Lasso</span><span class="p">(</span><span class="mf">200.0</span><span class="p">)</span>
<span class="lineno">24 </span> <span class="n">lr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">,</span> <span class="n">max_iter</span><span class="o">=</span><span class="mi">100</span><span class="p">)</span>
<span class="lineno">25 </span> <span class="c1"># now test on the test partition</span>
<span class="lineno">26 </span> <span class="n">y_hat</span> <span class="o">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">X_test</span><span class="p">)</span>
<span class="lineno">27 </span> <span class="c1"># we calculate the root of mean squared error (rmse)</span>
<span class="lineno">28 </span> <span class="n">rmse</span> <span class="o">=</span> <span class="n">mean_squared_error</span><span class="p">(</span><span class="n">y_test</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">)</span> <span class="o">**</span> <span class="mf">0.5</span>
<span class="lineno">29 </span> <span class="n">scores</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">rmse</span><span class="p">)</span>
<span class="lineno">30 </span>
<span class="lineno">31 </span><span class="c1"># average rmse from 5-fold cross-validation</span>
<span class="lineno">32 </span><span class="nb">print</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="n">scores</span><span class="p">)</span><span class="o">/</span><span class="n">k</span><span class="p">)</span></code></pre></figure><p>Run the code snippets, we have the $5$-fold cross-validation accuracy as follows.</p>
<div class="codewrapper">
<div class="codeleft">
<language>R</language>
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno">1 </span>chapter7 <span class="o">$</span>r <span class="o">-</span>f cv.R
<span class="lineno">2 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">4.978324</span></code></pre></figure></div>
<div class="coderight">
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno">1 </span><span class="n">chapter7</span> <span class="err">$</span><span class="n">python3</span><span class="o">.</span><span class="mi">7</span> <span class="n">cv</span><span class="o">.</span><span class="n">py</span>
<span class="lineno">2 </span><span class="mf">5.702339699398128</span></code></pre></figure></div>
</div>
<p>We add the line <code>sys.path.append("..")</code> in the Python code, otherwise it would throw an error because of the <code>import</code> mechanism<sup class="footnote" id="fnr2"><a href="#fn2">2</a></sup>.</p>
<h3>Evaluation metrics</h3>
<p>In the example above, we used the root mean squared error (<span class="caps">RMSE</span>) to measure the accuracy of the linear regression model. There are various metrics to evaluate the accuracy of predictive models.</p>
<ul>
<li>Metrics for regression</li>
</ul>
<p>For regression models, <span class="caps">RMSE</span>, mean absolute error (<span class="caps">MAE</span>)<sup class="footnote" id="fnr3"><a href="#fn3">3</a></sup> and mean absolute percentage error (<span class="caps">MAPE</span>)<sup class="footnote" id="fnr4"><a href="#fn4">4</a></sup> are some of the commonly-used evaluation metrics. You may have heard of the coefficient of determination ($R^2$ or adjusted $R^{2}$) in Statistics. But from a predictive modeling perspective, $R^{2}$ is not a metric that evaluates the predictive power of the model, since its calculation is based on the training data, whereas what we are actually interested in is the model performance on unseen data. In Statistics, goodness of fit is a term describing how well a model fits the observations, and $R^{2}$ is one such measure of goodness of fit. In predictive modeling, we care more about the error of the model on unseen data, which is called the generalization error. But of course, it is possible to calculate the counterpart of $R^{2}$ on testing data.</p>
<figcaption class="centerfigcaption">Some metrics for regression models</figcaption>
<table>
<tr>
<td> <strong>metric</strong> </td>
<td> <strong>formula</strong> </td>
</tr>
<tr>
<td> <span class="caps">RMSE</span> </td>
<td> $\sqrt{\frac{\sum_{i=1}^{n} (\hat{y}_{i}-y_{i})^{2}}{n}}$ </td>
</tr>
<tr>
<td> <span class="caps">MAE</span> </td>
<td> $\frac{\sum_{i=1}^{n} \lvert \hat{y}_{i}-y_{i} \rvert}{n}$ </td>
</tr>
<tr>
<td> <span class="caps">MAPE</span> </td>
<td> $\frac{1}{n} \sum_{i=1}^{n} \lvert \frac{\hat{y}_{i}-y_{i}}{y_{i}} \rvert$ </td>
</tr>
</table>
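<p>As mentioned above, it is possible to calculate the counterpart of $R^{2}$ on testing data. A minimal sketch (my own example with made-up numbers, using <code>scikit-learn</code>):</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np
from sklearn.metrics import r2_score

# in practice y_test comes from the unseen (test) partition
y_test = np.array([3.0, 1.5, 2.0, 7.0, 4.5])
y_hat = np.array([2.5, 1.0, 2.2, 8.0, 5.0])
# the same formula as R^2, but evaluated on testing data
print(r2_score(y_test, y_hat))
</code></pre></figure>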
<ul>
<li>Metrics for classification</li>
</ul>
<p>The most intuitive metric for classification models is accuracy, the percentage of correctly classified instances. To calculate accuracy, we need a predicted label for each instance. Recall that for the logistic regression we introduced in Chapter 5, the direct outputs are probabilities rather than labels. In that case, we need to convert the probability output to a label for the accuracy calculation. For example, consider a classification problem where the possible labels of each instance are $0$, $1$, $2$ and $3$. If the predicted probabilities are $0.2$, $0.25$, $0.5$ and $0.05$ for labels $0$, $1$, $2$ and $3$, respectively, then the predicted label is $2$ since its corresponding probability is the largest.</p>
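<p>As a minimal sketch (my own illustration, not code from this book), picking the label with the largest predicted probability is just an argmax over the probabilities:</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np

# predicted probabilities for labels 0, 1, 2 and 3
prob = np.array([0.2, 0.25, 0.5, 0.05])
# the predicted label is the index of the largest probability
print(np.argmax(prob))  # 2
</code></pre></figure>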
<p>But we don’t always care about the labels of an instance. For example, a classification model for mortgage default built in a bank may only be used to calculate the expected monetary loss. Another example is a recommendation system that predicts probabilities which are used for ranking items. In such cases, the model performance can be evaluated by <code>logloss</code>, <code>AUC</code>, etc., using the output probabilities directly. We saw in Chapter 5 that the loss function of logistic regression is the negative log-likelihood function.</p>
<p>Actually, <code>logloss</code> is just the average negative log-likelihood evaluated on the testing data, and thus it can also be used for classification models with more than 2 classes (labels), because the likelihood function is not restricted to the Bernoulli distribution (the extension of the Bernoulli distribution to more than two outcomes is called the categorical distribution<sup class="footnote" id="fnr5"><a href="#fn5">5</a></sup>). Another name for <code>logloss</code> is <code>cross-entropy loss</code>.</p>
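<p>To make the connection explicit, below is a minimal sketch (my own illustration with made-up labels and probabilities) showing that <code>logloss</code> for a binary classifier is exactly the average negative log-likelihood under the Bernoulli distribution:</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np
from sklearn.metrics import log_loss

y = np.array([0, 1, 1, 0, 1])            # observed labels
p = np.array([0.1, 0.8, 0.6, 0.3, 0.9])  # predicted P(label = 1)

# average negative log-likelihood of the observed labels
manual = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
print(np.isclose(manual, log_loss(y, p)))  # True
</code></pre></figure>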
<p>In practice, <code>AUC</code> (Area Under the <span class="caps">ROC</span> Curve) is a very popular evaluation metric for binary classification problems. <span class="caps">AUC</span> is bounded between 0 and 1. A perfect model leads to an <span class="caps">AUC</span> equal to 1. If a model’s predictions are $100\%$ wrong, the resulting <span class="caps">AUC</span> is equal to 0. But if we know a binary classification model always produces $100\%$ wrong predictions, we can instead use $1-\hat{y}$ as the corrected prediction; the corrected model is then perfect and its <span class="caps">AUC</span> is 1. A model that makes completely random guesses leads to an <span class="caps">AUC</span> of about 0.5. Thus, in practice the evaluated <span class="caps">AUC</span> is usually between 0.5 and 1.</p>
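<p>The claim about correcting an always-wrong model can be verified directly. In the sketch below (my own example), every positive instance is ranked below every negative one, so the <span class="caps">AUC</span> is 0; flipping the predictions yields an <span class="caps">AUC</span> of 1:</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np
from sklearn.metrics import roc_auc_score

y = np.array([0, 1, 1, 0, 1, 0])
p = np.array([0.7, 0.2, 0.4, 0.8, 0.3, 0.9])  # a completely wrong ranking

print(roc_auc_score(y, p))      # 0.0
print(roc_auc_score(y, 1 - p))  # 1.0, the "corrected" prediction
</code></pre></figure>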
<p>There are also many other metrics, such as recall, precision, and the F1 score<sup class="footnote" id="fnr6"><a href="#fn6">6</a></sup>.</p>
<p>The selection of evaluation metrics in predictive modeling is important but also subjective. Sometimes we may also need to define a customized evaluation metric.</p>
<p>Many evaluation metrics can be found from the R package <code>Metrics</code> and the Python module <code>sklearn.metrics</code>.</p>
<div class="codewrapper">
<div class="codeleft">
<language>R</language>
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno"> 1 </span><span class="o">></span> <span class="kp">set.seed</span><span class="p">(</span><span class="m">42</span><span class="p">)</span>
<span class="lineno"> 2 </span><span class="o">></span> <span class="c1"># regression metrics</span>
<span class="lineno"> 3 </span><span class="o">></span> y <span class="o">=</span> rnorm<span class="p">(</span>n <span class="o">=</span> <span class="m">10</span><span class="p">,</span> mean <span class="o">=</span> <span class="m">10</span><span class="p">,</span> sd <span class="o">=</span> <span class="m">2</span><span class="p">)</span>
<span class="lineno"> 4 </span><span class="o">></span> y
<span class="lineno"> 5 </span> <span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">12.741917</span> <span class="m">8.870604</span> <span class="m">10.726257</span> <span class="m">11.265725</span> <span class="m">10.808537</span> <span class="m">9.787751</span>
<span class="lineno"> 6 </span> <span class="p">[</span><span class="m">7</span><span class="p">]</span> <span class="m">13.023044</span> <span class="m">9.810682</span> <span class="m">14.036847</span> <span class="m">9.874572</span>
<span class="lineno"> 7 </span><span class="o">></span> <span class="c1"># we use random numbers as the predictions</span>
<span class="lineno"> 8 </span><span class="o">></span> y_hat <span class="o">=</span> rnorm<span class="p">(</span>n <span class="o">=</span> <span class="m">10</span><span class="p">,</span> mean <span class="o">=</span> <span class="m">10.5</span><span class="p">,</span> sd <span class="o">=</span> <span class="m">2.2</span><span class="p">)</span>
<span class="lineno"> 9 </span><span class="o">></span> y_hat
<span class="lineno">10 </span> <span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">13.370713</span> <span class="m">15.530620</span> <span class="m">7.444506</span> <span class="m">9.886665</span> <span class="m">10.206693</span> <span class="m">11.899091</span>
<span class="lineno">11 </span> <span class="p">[</span><span class="m">7</span><span class="p">]</span> <span class="m">9.874644</span> <span class="m">4.655798</span> <span class="m">5.130973</span> <span class="m">13.404249</span>
<span class="lineno">12 </span><span class="o">></span>
<span class="lineno">13 </span><span class="o">></span> rmse<span class="p">(</span>actual <span class="o">=</span> y<span class="p">,</span> predicted <span class="o">=</span> y_hat<span class="p">)</span>
<span class="lineno">14 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">4.364646</span>
<span class="lineno">15 </span><span class="o">></span> mae<span class="p">(</span>actual <span class="o">=</span> y<span class="p">,</span> predicted <span class="o">=</span> y_hat<span class="p">)</span>
<span class="lineno">16 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">3.540164</span>
<span class="lineno">17 </span><span class="o">></span> mape<span class="p">(</span>actual <span class="o">=</span> y<span class="p">,</span> predicted <span class="o">=</span> y_hat<span class="p">)</span>
<span class="lineno">18 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">0.3259014</span>
<span class="lineno">19 </span><span class="o">></span> <span class="c1"># classification metrics</span>
<span class="lineno">20 </span><span class="o">></span> y <span class="o">=</span> rbinom<span class="p">(</span>n <span class="o">=</span> <span class="m">10</span><span class="p">,</span> size <span class="o">=</span> <span class="m">1</span><span class="p">,</span> prob<span class="o">=</span><span class="m">0.25</span><span class="p">)</span>
<span class="lineno">21 </span><span class="o">></span> y
<span class="lineno">22 </span> <span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">0</span> <span class="m">0</span> <span class="m">0</span> <span class="m">1</span> <span class="m">0</span> <span class="m">1</span> <span class="m">1</span> <span class="m">0</span> <span class="m">1</span> <span class="m">0</span>
<span class="lineno">23 </span><span class="o">></span> y_hat <span class="o">=</span> runif<span class="p">(</span><span class="m">10</span><span class="p">,</span> <span class="m">0</span><span class="p">,</span> <span class="m">1</span><span class="p">)</span>
<span class="lineno">24 </span><span class="o">></span> logLoss<span class="p">(</span>y<span class="p">,</span> y_hat<span class="p">)</span>
<span class="lineno">25 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">0.4553994</span>
<span class="lineno">26 </span><span class="o">></span> auc<span class="p">(</span>y<span class="p">,</span> y_hat<span class="p">)</span>
<span class="lineno">27 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">0.8333333</span></code></pre></figure></div>
<div class="coderight">
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="o">>>></span> <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="lineno"> 2 </span><span class="o">>>></span> <span class="kn">from</span> <span class="nn">sklearn.metrics</span> <span class="k">import</span> <span class="n">mean_squared_error</span><span class="p">,</span> <span class="n">mean_absolute_error</span><span class="p">,</span> <span class="n">log_loss</span><span class="p">,</span> <span class="n">roc_auc_score</span>
<span class="lineno"> 3 </span><span class="o">>>></span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span>
<span class="lineno"> 4 </span><span class="o">>>></span> <span class="c1"># regression metrics</span>
<span class="lineno"> 5 </span><span class="o">...</span>
<span class="lineno"> 6 </span><span class="o">>>></span> <span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
<span class="lineno"> 7 </span><span class="o">>>></span> <span class="n">y_hat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="mf">10.5</span><span class="p">,</span> <span class="mf">2.2</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
<span class="lineno"> 8 </span><span class="o">>>></span> <span class="n">mean_squared_error</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">)</span> <span class="o">**</span> <span class="mf">0.5</span> <span class="c1"># rmse</span>
<span class="lineno"> 9 </span><span class="mf">3.0668667318485165</span>
<span class="lineno">10 </span><span class="o">>>></span> <span class="n">mean_absolute_error</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">)</span> <span class="c1"># mae</span>
<span class="lineno">11 </span><span class="mf">2.1355703394788237</span>
<span class="lineno">12 </span><span class="o">>>></span> <span class="c1"># let's define mape since it's not available</span>
<span class="lineno">13 </span><span class="o">...</span>
<span class="lineno">14 </span><span class="o">>>></span> <span class="k">def</span> <span class="nf">mape</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">):</span> <span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">y</span><span class="o">-</span><span class="n">y_hat</span><span class="p">)</span><span class="o">/</span><span class="n">y_hat</span><span class="p">)</span>
<span class="lineno">15 </span><span class="o">...</span>
<span class="lineno">16 </span><span class="o">>>></span> <span class="n">mape</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">)</span>
<span class="lineno">17 </span><span class="mf">0.292059554974094</span>
<span class="lineno">18 </span><span class="o">>>></span> <span class="c1"># classification metrics</span>
<span class="lineno">19 </span><span class="o">>>></span> <span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">binomial</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">0.25</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
<span class="lineno">20 </span><span class="o">>>></span> <span class="n">y_hat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
<span class="lineno">21 </span><span class="o">>>></span> <span class="n">log_loss</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">)</span>
<span class="lineno">22 </span><span class="mf">0.47071363776285635</span>
<span class="lineno">23 </span><span class="o">>>></span> <span class="n">roc_auc_score</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">)</span> <span class="c1"># auc</span>
<span class="lineno">24 </span><span class="mf">0.8095238095238095</span></code></pre></figure></div>
</div>
<h3>Feature engineering & embedding</h3>
<p>According to the explanation of feature engineering<sup class="footnote" id="fnr7"><a href="#fn7">7</a></sup> on Wikipedia, feature engineering is the process of using domain knowledge to create new features based on existing features. In reality, it is not rare to see feature engineering lead to better prediction accuracy, and sometimes I use feature engineering too. But I think feature engineering will become less and less useful in the future as machine learning algorithms become more and more intelligent.</p>
<p>Let’s consider three features $x_{1},x_{2}$ and $x_{3}$, and assume the actual model is specified as $y=f(x_{1},g(x_{2}, x_{3}))$. After all, $y$ is still a function of $x_{1},x_{2}$ and $x_{3}$, and we can write it as $y=h(x_{1},x_{2}, x_{3})$. Thus, even without creating the new feature $x_{4}=g(x_{2}, x_{3})$ explicitly, a universal approximator should ideally be able to learn (i.e., approximate) the function $h$ from the data. This idea is also supported by the Kolmogorov–Arnold representation theorem<sup class="footnote" id="fnr8"><a href="#fn8">8</a></sup>, which says that any continuous real-valued multivariate function can be written as a finite superposition of continuous functions of a single variable and addition.</p>
<p>$$<br />
\begin{equation}<br />
f(x\sb{1},…,x\sb{m})=\sum_{q=0}^{2m} {\Phi\sb{q}\big(\sum_{p=1} ^{m} \phi_{q,p} (x_{p})}\big) .<br />
\label{eq:ka}<br />
\end{equation}<br />
$$</p>
<p>As of today, since the machine learning algorithms are not that intelligent, it is worth trying feature engineering especially when domain knowledge is available.</p>
<p>If you are familiar with dimension reduction, embedding can be considered something similar. Dimension reduction aims at reducing the dimension of $\boldsymbol{X}$. It sounds interesting and promising if we can transform a high-dimensional dataset into a low-dimensional one and feed the dataset in the low-dimensional space to the machine learning model. However, I don’t think this is a good idea in general, because it is not guaranteed that the low-dimensional predictors keep all the information related to the response variable. Actually, many machine learning models are capable of handling high-dimensional predictors directly.</p>
<p>Embedding also transforms the features into a new space, which usually has a lower dimension. But generally embedding is not done by traditional dimension reduction techniques (for example, principal component analysis). In natural language processing, a word can be embedded into a vector space by word2vec<sup class="footnote" id="fnr9"><a href="#fn9">9</a></sup> (or other techniques). When an instance is associated with an image, we may consider using an autoencoder<sup class="footnote" id="fnr10"><a href="#fn10">10</a></sup> to encode/embed the image into a lower-dimensional space, which is usually achieved by (deep) neural networks.</p>
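<p>For concreteness, here is a minimal sketch (my own example on synthetic data) of traditional dimension reduction with principal component analysis in <code>scikit-learn</code>:</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np
from sklearn.decomposition import PCA

np.random.seed(42)
X = np.random.normal(size=(100, 10))  # 100 instances, 10 features

pca = PCA(n_components=2)     # keep the top 2 principal components
X_low = pca.fit_transform(X)  # shape (100, 2)
print(X_low.shape)
print(pca.explained_variance_ratio_)  # variance captured by each component
</code></pre></figure>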
<h3>Collinearity</h3>
<p>Collinearity is one of the clichés in machine learning. For non-linear models, collinearity is usually not a problem. For linear models, I recommend reading this discussion<sup class="footnote" id="fnr11"><a href="#fn11">11</a></sup> to see when it is not a problem.</p>
<h3>Feature selection & parameter tuning</h3>
<p>We have seen in Chapter 5 how the Lasso solutions of linear models can be used for feature selection. What about non-linear models? There are some model-specific techniques for feature selection. There is also a metaheuristic approach to feature selection: cross-validation. Specifically, we can try different combinations of the features and use cross-validation to select the set of features that results in the best cross-validation evaluation metric. However, the major problem of this approach is its efficiency. When the number of features is large, it is impossible to try all the different combinations with limited computational resources. Thus, it is better to use model-specific feature selection techniques in practice.</p>
<p>To tune model parameters, such as the $\lambda$ in Lasso, we can also use cross-validation, as sketched below. But again, efficiency is our major concern.</p>
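<p>A minimal sketch of tuning $\lambda$ with cross-validation (using <code>scikit-learn</code>’s own Lasso, where $\lambda$ is called <code>alpha</code>, and synthetic data rather than the implementation from Chapter 5):</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=20, noise=5.0, random_state=42)

alphas = [0.01, 0.1, 1.0, 10.0]  # candidate values of the penalty parameter
rmses = []
for alpha in alphas:
    # 5-fold cross-validation; scikit-learn reports the negated RMSE
    scores = cross_val_score(Lasso(alpha=alpha), X, y, cv=5,
                             scoring="neg_root_mean_squared_error")
    rmses.append(-scores.mean())

best = int(np.argmin(rmses))  # pick the alpha with the smallest average RMSE
print(alphas[best], rmses[best])
</code></pre></figure>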
<p>Can we have feature selection and parameter tuning done automatically? Actually, automated machine learning has been a hot research topic in both academia and industry.</p>
<h2 id="gbm">Gradient boosting machine</h2>
<h3>Decision tree</h3>
<p>A decision tree consists of a bunch of nodes. In a decision tree, the node with no parent node is called the root node, and a node without any children is called a leaf node.</p>
<figure class="text-center">
<img src="figures/tree1.png" alt="A single decision tree." style="display: block;margin: auto;" width="50%">
<figcaption class="centerfigcaption">A single decision tree.</figcaption>
</figure>
<p>The length (number of nodes) of the longest path from the root node to a leaf node is called the depth of the tree. For example, the depth of the tree above is 4. Each leaf node has a label. In regression tasks, the label is a real number; in classification tasks, the label could be a real number that is used to get the class indirectly (for example, fed into a sigmoid function to get a probability), or an integer representing the predicted class directly.</p>
<p>Each node except the leaves in a decision tree is associated with a splitting rule. These splitting rules determine to which leaf an instance belongs. A rule is just a function that takes a feature as input and returns true or false as output. For example, a rule on the root could be $x_{1}<0$: if it is true, we go to the left node; otherwise we go to the right node. Once we arrive at a leaf, we get the predicted value based on the label of the leaf.</p>
<p>To get a closer look, let’s try to implement a binary tree structure for regression tasks in R/Python from scratch.</p>
<p>Let’s implement the binary tree as a recursive data structure, one that is partially composed of similar instances of the same data structure. More specifically, a binary tree can be decomposed into three components: its root node, the left subtree under the root, and the right subtree under the root. To define a binary (decision) tree, we only need to define these three components. And to define the left and right subtrees, the decomposition is applied recursively down to the leaves.</p>
<p>Now we have the big picture of how to define a binary tree. However, to make the binary tree a decision tree, we also need to define the splitting rules. For simplicity, we assume there are no missing values in our data and all variables are numeric. Then the splitting rule of a node is composed of two components: the variable to split on, and the corresponding breakpoint for splitting.</p>
<p>There is one more component we need to define in a decision tree: the predict method, which takes an instance as input and returns the prediction.</p>
<p>Now we are ready to define our binary decision tree.</p>
<language>R</language>
<p>chapter7/tree.R<br />
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno"> 1 </span><span class="kn">library</span><span class="p">(</span>R6<span class="p">)</span>
<span class="lineno"> 2 </span>Tree <span class="o">=</span> R6Class<span class="p">(</span>
<span class="lineno"> 3 </span> <span class="s">"Tree"</span><span class="p">,</span>
<span class="lineno"> 4 </span> public <span class="o">=</span> <span class="kt">list</span><span class="p">(</span>
<span class="lineno"> 5 </span> left <span class="o">=</span> <span class="kc">NULL</span><span class="p">,</span>
<span class="lineno"> 6 </span> right <span class="o">=</span> <span class="kc">NULL</span><span class="p">,</span>
<span class="lineno"> 7 </span> variable_id <span class="o">=</span> <span class="kc">NULL</span><span class="p">,</span>
<span class="lineno"> 8 </span> break_point <span class="o">=</span> <span class="kc">NULL</span><span class="p">,</span>
<span class="lineno"> 9 </span> val <span class="o">=</span> <span class="kc">NULL</span><span class="p">,</span>
<span class="lineno">10 </span> initialize <span class="o">=</span> <span class="kr">function</span><span class="p">(</span>left<span class="p">,</span> right<span class="p">,</span> variable_id<span class="p">,</span> break_point<span class="p">,</span> val<span class="p">)</span> <span class="p">{</span>
<span class="lineno">11 </span> self<span class="o">$</span>left <span class="o">=</span> left
<span class="lineno">12 </span> self<span class="o">$</span>right <span class="o">=</span> right
<span class="lineno">13 </span> self<span class="o">$</span>variable_id <span class="o">=</span> variable_id
<span class="lineno">14 </span> self<span class="o">$</span>break_point <span class="o">=</span> break_point
<span class="lineno">15 </span> self<span class="o">$</span>val <span class="o">=</span> val
<span class="lineno">16 </span> <span class="p">},</span>
<span class="lineno">17 </span> is_leaf <span class="o">=</span> <span class="kr">function</span><span class="p">()</span> <span class="p">{</span>
<span class="lineno">18 </span> <span class="kp">is.null</span><span class="p">(</span>self<span class="o">$</span>left<span class="p">)</span> <span class="o">&&</span> <span class="kp">is.null</span><span class="p">(</span>self<span class="o">$</span>right<span class="p">)</span>
<span class="lineno">19 </span> <span class="p">},</span>
<span class="lineno">20 </span> depth <span class="o">=</span> <span class="kr">function</span><span class="p">()</span> <span class="p">{</span>
<span class="lineno">21 </span> <span class="kr">if</span> <span class="p">(</span>self<span class="o">$</span>is_leaf<span class="p">())</span> <span class="p">{</span>
<span class="lineno">22 </span> <span class="m">1</span>
<span class="lineno">23 </span> <span class="p">}</span> <span class="kr">else</span> <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>self<span class="o">$</span>left<span class="p">))</span> <span class="p">{</span>
<span class="lineno">24 </span> <span class="m">1</span> <span class="o">+</span> self<span class="o">$</span>right<span class="o">$</span>depth<span class="p">()</span>
<span class="lineno">25 </span> <span class="p">}</span> <span class="kr">else</span> <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>self<span class="o">$</span>right<span class="p">))</span> <span class="p">{</span>
<span class="lineno">26 </span> <span class="m">1</span> <span class="o">+</span> self<span class="o">$</span>left<span class="o">$</span>depth<span class="p">()</span>
<span class="lineno">27 </span> <span class="p">}</span> <span class="kp">else</span><span class="p">{</span>
<span class="lineno">28 </span> <span class="m">1</span> <span class="o">+</span> <span class="kp">max</span><span class="p">(</span>self<span class="o">$</span>left<span class="o">$</span>depth<span class="p">(),</span> self<span class="o">$</span>right<span class="o">$</span>depth<span class="p">())</span>
<span class="lineno">29 </span> <span class="p">}</span>
<span class="lineno">30 </span> <span class="p">},</span>
<span class="lineno">31 </span> predict_single <span class="o">=</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
<span class="lineno">32 </span> <span class="c1"># if x is a vector</span>
<span class="lineno">33 </span> <span class="kr">if</span> <span class="p">(</span>self<span class="o">$</span>is_leaf<span class="p">())</span> <span class="p">{</span>
<span class="lineno">34 </span> self<span class="o">$</span>val
<span class="lineno">35 </span> <span class="p">}</span> <span class="kp">else</span><span class="p">{</span>
<span class="lineno">36 </span> <span class="kr">if</span> <span class="p">(</span>x<span class="p">[</span>self<span class="o">$</span>variable_id<span class="p">]</span> <span class="o"><</span> self<span class="o">$</span>break_point<span class="p">)</span> <span class="p">{</span>
<span class="lineno">37 </span> self<span class="o">$</span>left<span class="o">$</span>predict_single<span class="p">(</span>x<span class="p">)</span>
<span class="lineno">38 </span> <span class="p">}</span> <span class="kp">else</span><span class="p">{</span>
<span class="lineno">39 </span> self<span class="o">$</span>right<span class="o">$</span>predict_single<span class="p">(</span>x<span class="p">)</span>
<span class="lineno">40 </span> <span class="p">}</span>
<span class="lineno">41 </span> <span class="p">}</span>
<span class="lineno">42 </span> <span class="p">},</span>
<span class="lineno">43 </span> predict <span class="o">=</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
<span class="lineno">44 </span> <span class="c1"># if x is an array</span>
<span class="lineno">45 </span> preds <span class="o">=</span> <span class="kp">rep</span><span class="p">(</span><span class="m">0.0</span><span class="p">,</span> <span class="kp">nrow</span><span class="p">(</span>x<span class="p">))</span>
<span class="lineno">46 </span> <span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="m">1</span><span class="o">:</span><span class="kp">nrow</span><span class="p">(</span>x<span class="p">))</span> <span class="p">{</span>
<span class="lineno">47 </span> preds<span class="p">[</span>i<span class="p">]</span> <span class="o">=</span> self<span class="o">$</span>predict_single<span class="p">(</span>x<span class="p">[</span>i<span class="p">,</span> <span class="p">])</span>
<span class="lineno">48 </span> <span class="p">}</span>
<span class="lineno">49 </span> preds
<span class="lineno">50 </span> <span class="p">},</span>
<span class="lineno">51 </span> print <span class="o">=</span> <span class="kr">function</span><span class="p">()</span> <span class="p">{</span>
<span class="lineno">52 </span> <span class="c1"># we can call print(tree), similar to the magic method in Python</span>
<span class="lineno">53 </span> <span class="kp">cat</span><span class="p">(</span><span class="s">"variable_id:"</span><span class="p">,</span> self<span class="o">$</span>variable_id<span class="p">,</span> <span class="s">"\n"</span><span class="p">)</span>
<span class="lineno">54 </span> <span class="kp">cat</span><span class="p">(</span><span class="s">"break at:"</span><span class="p">,</span> self<span class="o">$</span>break_point<span class="p">,</span> <span class="s">"\n"</span><span class="p">)</span>
<span class="lineno">55 </span> <span class="kp">cat</span><span class="p">(</span><span class="s">"is_leaf:"</span><span class="p">,</span> self<span class="o">$</span>is_leaf<span class="p">(),</span> <span class="s">"\n"</span><span class="p">)</span>
<span class="lineno">56 </span> <span class="kp">cat</span><span class="p">(</span><span class="s">"val:"</span><span class="p">,</span> self<span class="o">$</span>val<span class="p">,</span> <span class="s">"\n"</span><span class="p">)</span>
<span class="lineno">57 </span> <span class="kp">cat</span><span class="p">(</span><span class="s">"depth:"</span><span class="p">,</span> self<span class="o">$</span>depth<span class="p">(),</span> <span class="s">"\n"</span><span class="p">)</span>
<span class="lineno">58 </span> <span class="kp">invisible</span><span class="p">(</span>self<span class="p">)</span>
<span class="lineno">59 </span> <span class="p">}</span>
<span class="lineno">60 </span> <span class="p">)</span>
<span class="lineno">61 </span><span class="p">)</span></code></pre></figure></p>
<language>Python</language>
<p>chapter7/tree.py<br />
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="k">class</span> <span class="nc">Tree</span><span class="p">:</span>
<span class="lineno"> 2 </span> <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">,</span> <span class="n">variable_id</span><span class="p">,</span> <span class="n">break_point</span><span class="p">,</span> <span class="n">val</span><span class="p">):</span>
<span class="lineno"> 3 </span> <span class="bp">self</span><span class="o">.</span><span class="n">left</span> <span class="o">=</span> <span class="n">left</span>
<span class="lineno"> 4 </span> <span class="bp">self</span><span class="o">.</span><span class="n">right</span> <span class="o">=</span> <span class="n">right</span>
<span class="lineno"> 5 </span> <span class="bp">self</span><span class="o">.</span><span class="n">variable_id</span> <span class="o">=</span> <span class="n">variable_id</span>
<span class="lineno"> 6 </span> <span class="bp">self</span><span class="o">.</span><span class="n">break_point</span> <span class="o">=</span> <span class="n">break_point</span>
<span class="lineno"> 7 </span> <span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="n">val</span>
<span class="lineno"> 8 </span>
<span class="lineno"> 9 </span> <span class="nd">@property</span>
<span class="lineno">10 </span> <span class="k">def</span> <span class="nf">is_leaf</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="lineno">11 </span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">left</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">right</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="lineno">12 </span>
<span class="lineno">13 </span> <span class="k">def</span> <span class="nf">_predict_single</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="lineno">14 </span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_leaf</span><span class="p">:</span>
<span class="lineno">15 </span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">val</span>
<span class="lineno">16 </span> <span class="k">if</span> <span class="n">x</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">variable_id</span><span class="p">]</span> <span class="o"><</span> <span class="bp">self</span><span class="o">.</span><span class="n">break_point</span><span class="p">:</span>
<span class="lineno">17 </span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">left</span><span class="o">.</span><span class="n">_predict_single</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="lineno">18 </span> <span class="k">else</span><span class="p">:</span>
<span class="lineno">19 </span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">right</span><span class="o">.</span><span class="n">_predict_single</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="lineno">20 </span>
<span class="lineno">21 </span> <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="lineno">22 </span> <span class="k">return</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_predict_single</span><span class="p">(</span><span class="n">e</span><span class="p">)</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">x</span><span class="p">]</span>
<span class="lineno">23 </span>
<span class="lineno">24 </span> <span class="nd">@property</span>
<span class="lineno">25 </span> <span class="k">def</span> <span class="nf">depth</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="lineno">26 </span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_leaf</span><span class="p">:</span>
<span class="lineno">27 </span> <span class="k">return</span> <span class="mi">1</span>
<span class="lineno">28 </span> <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">left</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">29 </span> <span class="k">return</span> <span class="mi">1</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">right</span><span class="o">.</span><span class="n">depth</span>
<span class="lineno">30 </span> <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">right</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">31 </span> <span class="k">return</span> <span class="mi">1</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">left</span><span class="o">.</span><span class="n">depth</span>
<span class="lineno">32 </span> <span class="k">return</span> <span class="mi">1</span> <span class="o">+</span> <span class="nb">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">left</span><span class="o">.</span><span class="n">depth</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">right</span><span class="o">.</span><span class="n">depth</span><span class="p">)</span>
<span class="lineno">33 </span>
<span class="lineno">34 </span> <span class="k">def</span> <span class="nf">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="lineno">35 </span> <span class="k">return</span> <span class="s2">"variable_id: </span><span class="si">{0}</span><span class="se">\n</span><span class="s2">break_at: </span><span class="si">{1}</span><span class="se">\n</span><span class="s2">val: </span><span class="si">{2}</span><span class="se">\n</span><span class="s2">is_leaf: </span><span class="si">{3}</span><span class="se">\n</span><span class="s2">height: </span><span class="si">{4}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="lineno">36 </span> <span class="bp">self</span><span class="o">.</span><span class="n">variable_id</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">break_point</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">val</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_leaf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">depth</span><span class="p">)</span></code></pre></figure></p>
<p>You may have noticed the usage of <code>@property</code> in our Python implementation. It is one of the built-in decorators in Python. We won’t go into decorators in detail here. Basically, applying this decorator makes the method <code>depth</code> behave like an attribute, in the sense that we can write <code>self.depth</code> instead of <code>self.depth()</code> to get the depth.</p>
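<p>For example, here is a minimal standalone sketch (the class and names are made up purely for illustration) showing the effect of the decorator.</p>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">class Circle:
    def __init__(self, radius):
        self.radius = radius

    @property
    def area(self):
        # computed on every access, but read like a plain attribute
        return 3.141592653589793 * self.radius ** 2

c = Circle(2.0)
print(c.area)  # 12.566..., no parentheses needed
</code></pre></figure>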
<p>In the R implementation, <code>invisible(self)</code> is returned from the print method, which may look strange. It is a workaround for an issue<sup class="footnote" id="fnr12"><a href="#fn12">12</a></sup> of the <code>R6</code> class caused by the <code>S3</code> dispatch mechanism, which is not introduced in this book.</p>
<p>The above implementation doesn’t involve the training or fitting of the decision tree. In this book, we won’t talk about how to fit a traditional decision tree model because of its limited usage in the context of modern machine learning. Let’s see how to use the decision tree structure we defined above by creating the pseudo decision tree illustrated below.</p>
<figure class="text-center">
<img src="figures/tree2.png" alt="" style="display: block;margin: auto;" width="75%">
<figcaption class="centerfigcaption"></figcaption>
</figure>
<div class="codewrapper">
<div class="codeleft">
<language>R</language>
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno"> 1 </span><span class="o">></span> <span class="kn">source</span><span class="p">(</span><span class="s">'tree.R'</span><span class="p">)</span>
<span class="lineno"> 2 </span><span class="o">></span> node_2 <span class="o">=</span> Tree<span class="o">$</span>new<span class="p">(</span><span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="m">0.6</span><span class="p">)</span>
<span class="lineno"> 3 </span><span class="o">></span> node_3 <span class="o">=</span> Tree<span class="o">$</span>new<span class="p">(</span><span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="m">2.4</span><span class="p">)</span>
<span class="lineno"> 4 </span><span class="o">></span> node_4 <span class="o">=</span> Tree<span class="o">$</span>new<span class="p">(</span><span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span> <span class="m">4.5</span><span class="p">)</span>
<span class="lineno"> 5 </span><span class="o">></span> node_1 <span class="o">=</span> Tree<span class="o">$</span>new<span class="p">(</span>node_3<span class="p">,</span> node_4<span class="p">,</span> <span class="m">1</span><span class="p">,</span> <span class="m">10.5</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">)</span>
<span class="lineno"> 6 </span><span class="o">></span> node_0 <span class="o">=</span> Tree<span class="o">$</span>new<span class="p">(</span>node_1<span class="p">,</span> node_2<span class="p">,</span> <span class="m">2</span><span class="p">,</span> <span class="m">3.2</span><span class="p">,</span> <span class="kc">NULL</span><span class="p">)</span>
<span class="lineno"> 7 </span><span class="o">></span> <span class="kp">print</span><span class="p">(</span>node_0<span class="p">)</span>
<span class="lineno"> 8 </span>variable_id<span class="o">:</span> <span class="m">2</span>
<span class="lineno"> 9 </span><span class="kr">break</span> at<span class="o">:</span> <span class="m">3.2</span>
<span class="lineno">10 </span>is_leaf<span class="o">:</span> <span class="kc">FALSE</span>
<span class="lineno">11 </span>val<span class="o">:</span>
<span class="lineno">12 </span>depth<span class="o">:</span> <span class="m">3</span>
<span class="lineno">13 </span><span class="o">></span> <span class="kp">print</span><span class="p">(</span>node_4<span class="p">)</span>
<span class="lineno">14 </span>variable_id<span class="o">:</span>
<span class="lineno">15 </span><span class="kr">break</span> at<span class="o">:</span>
<span class="lineno">16 </span>is_leaf<span class="o">:</span> <span class="kc">TRUE</span>
<span class="lineno">17 </span>val<span class="o">:</span> <span class="m">4.5</span>
<span class="lineno">18 </span>depth<span class="o">:</span> <span class="m">1</span>
<span class="lineno">19 </span><span class="o">></span> x <span class="o">=</span> <span class="kt">array</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="m">10</span><span class="p">,</span> <span class="m">0.5</span><span class="p">),</span> <span class="kt">c</span><span class="p">(</span><span class="m">1</span><span class="p">,</span> <span class="m">2</span><span class="p">))</span>
<span class="lineno">20 </span><span class="o">></span> node_0<span class="o">$</span>predict<span class="p">(</span>x<span class="p">)</span>
<span class="lineno">21 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">2.4</span></code></pre></figure></div>
<div class="coderight">
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="o">>>></span> <span class="kn">from</span> <span class="nn">tree</span> <span class="k">import</span> <span class="n">Tree</span>
<span class="lineno"> 2 </span><span class="o">>>></span> <span class="n">node_2</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="mf">0.6</span><span class="p">)</span>
<span class="lineno"> 3 </span><span class="o">>>></span> <span class="n">node_3</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="mf">2.4</span><span class="p">)</span>
<span class="lineno"> 4 </span><span class="o">>>></span> <span class="n">node_4</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="mf">4.5</span><span class="p">)</span>
<span class="lineno"> 5 </span><span class="o">>>></span> <span class="n">node_1</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="n">node_3</span><span class="p">,</span> <span class="n">node_4</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mf">10.5</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="lineno"> 6 </span><span class="o">>>></span> <span class="n">node_0</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="n">node_1</span><span class="p">,</span> <span class="n">node_2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mf">3.2</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="lineno"> 7 </span><span class="o">>>></span>
<span class="lineno"> 8 </span><span class="o">>>></span> <span class="nb">print</span><span class="p">(</span><span class="n">node_0</span><span class="p">)</span>
<span class="lineno"> 9 </span><span class="n">variable_id</span><span class="p">:</span> <span class="mi">1</span>
<span class="lineno">10 </span><span class="n">break_at</span><span class="p">:</span> <span class="mf">3.2</span>
<span class="lineno">11 </span><span class="n">val</span><span class="p">:</span> <span class="kc">None</span>
<span class="lineno">12 </span><span class="n">is_leaf</span><span class="p">:</span> <span class="kc">False</span>
<span class="lineno">13 </span><span class="n">depth</span><span class="p">:</span> <span class="mi">3</span>
<span class="lineno">14 </span><span class="o">>>></span> <span class="nb">print</span><span class="p">(</span><span class="n">node_4</span><span class="p">)</span>
<span class="lineno">15 </span><span class="n">variable_id</span><span class="p">:</span> <span class="kc">None</span>
<span class="lineno">16 </span><span class="n">break_at</span><span class="p">:</span> <span class="kc">None</span>
<span class="lineno">17 </span><span class="n">val</span><span class="p">:</span> <span class="mf">4.5</span>
<span class="lineno">18 </span><span class="n">is_leaf</span><span class="p">:</span> <span class="kc">True</span>
<span class="lineno">19 </span><span class="n">depth</span><span class="p">:</span> <span class="mi">1</span>
<span class="lineno">20 </span><span class="o">>>></span> <span class="n">x</span> <span class="o">=</span> <span class="p">[[</span><span class="mi">10</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">]]</span>
<span class="lineno">21 </span><span class="o">>>></span> <span class="n">node_0</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="lineno">22 </span><span class="p">[</span><span class="mf">2.4</span><span class="p">]</span></code></pre></figure></div>
</div>
<p>It’s worth noting that decision trees are universal approximators: with enough splits, the piecewise-constant function represented by a tree can approximate any continuous function on a bounded domain arbitrarily well.</p>
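<p>As a minimal sketch using the <code>Tree</code> class defined above, the depth-3 tree below approximates $f(x)=x^{2}$ on $[0, 1)$ by a piecewise-constant function with four pieces; finer splits would shrink the error further (the split points and leaf values here are hand-picked for illustration).</p>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">from tree import Tree

def leaf(v):
    return Tree(None, None, None, None, v)

# split at 0.25, 0.5 and 0.75; each leaf predicts f at the midpoint of its piece
left = Tree(leaf(0.125 ** 2), leaf(0.375 ** 2), 0, 0.25, None)
right = Tree(leaf(0.625 ** 2), leaf(0.875 ** 2), 0, 0.75, None)
root = Tree(left, right, 0, 0.5, None)

print(root.predict([[0.3], [0.9]]))  # [0.140625, 0.765625] vs true [0.09, 0.81]
</code></pre></figure>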
<h3>Tree growing in gradient boosting machine</h3>
<p>What is a gradient boosting machine (<span class="caps">GBM</span>) (or gradient boosting regression)? Essentially, a <span class="caps">GBM</span> is just a forest of decision trees.<br />
If you have heard of random forest (RF), you may know that a random forest is also an ensemble of trees. So what is the difference between a <span class="caps">GBM</span> and an RF?</p>
<p>Looking at the fitted trees from an RF and a <span class="caps">GBM</span>, there is no way to tell whether the trees were fitted by an RF or a <span class="caps">GBM</span>. The major difference lies in how these trees are trained, rather than in the trees themselves. A minor difference is how the trees are used for prediction. In many RF implementations, the prediction for an instance is the average of the predictions of all trees within the forest. For a classification task, there are two ways to get the final prediction: (a) predict the class by majority voting directly, i.e., the predicted class is the one with the highest frequency among the predicted classes of all trees; (b) predict the probability based on the frequencies, for example, if three out of five trees output class 1 then the predicted probability of class 1 is $3/5$. In many <span class="caps">GBM</span> implementations, the prediction (for regression tasks) is the sum of the predictions of the individual trees.</p>
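<p>As a toy illustration of these prediction schemes (all numbers below are made up), consider the following snippet.</p>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"># assumed predictions from five RF trees for one instance of a binary classification task
rf_preds = [1, 0, 1, 1, 0]
majority = max(set(rf_preds), key=rf_preds.count)         # (a) majority voting -> 1
p_class1 = sum(p == 1 for p in rf_preds) / len(rf_preds)  # (b) frequency-based probability -> 0.6

# in a GBM regression, the prediction is the sum of the trees' predictions
gbm_preds = [1.2, -0.3, 0.4]
gbm_prediction = sum(gbm_preds)  # 1.3
</code></pre></figure>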
<p><span class="caps">GBM</span> fits trees sequentially, but RF fits trees independently. The obvious advantage of fitting trees independently is that it can be done in parallel. But accuracy-wise, <span class="caps">GBM</span> usually performs better according to my limited experience.</p>
<p>We have seen the structure of a single decision tree in <span class="caps">GBM</span>. Now it’s time to see how these trees are fitted in <span class="caps">GBM</span>. Let’s start from the first tree.</p>
<p>To grow a tree, we start from its root node. In <span class="caps">GBM</span> fitting, we usually pre-determine a maximum depth $d$ for each tree to grow, and the final tree’s depth may be equal to or less than $d$. At a high level, the tree is grown recursively. Specifically, we first attempt to split the current node, and if the splitting improves the performance, we grow the left subtree and the right subtree under the current node. When we grow the left subtree, its maximum depth is $d-1$, and the same applies to the right subtree. We can define a tree grow function for this purpose, which takes a root node $Node_{root}$ (which is also a leaf at that point) as input. The pseudo code of the tree grow function is illustrated below, followed by a minimal Python skeleton.</p>
<ul>
<li>if $d>1$:
<ul>
<li>call split function on $Node_{root}$</li>
<li>if true is returned:
<ul>
<li>call grow function on the empty $Node_{left}$ with $d-1$ maximum depth</li>
<li>call grow function on the empty $Node_{right}$ with $d-1$ maximum depth</li>
</ul></li>
</ul></li>
<li>return</li>
</ul>
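<p>The skeleton below is a direct transcription of the pseudo code, assuming a <code>split</code> routine (the subject of the next section) that splits the node in place and returns a boolean; the full implementation appears later in this chapter as <code>grow_tree</code>.</p>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">def grow(node, d):
    # node: a Tree instance that is currently a leaf; d: remaining maximum depth
    if d > 1:
        if split(node):              # try to split the current leaf
            grow(node.left, d - 1)   # recursively grow the left subtree
            grow(node.right, d - 1)  # recursively grow the right subtree
    return
</code></pre></figure>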
<p>In fact, the tree growing algorithm is just a depth-first search (<span class="caps">DFS</span>).</p>
<p>To complete the pseudo algorithm above, we need a split function which takes a leaf node as input and returns a boolean value as output. If true is returned, we do the splitting, i.e., grow the left/right subtrees. So now the challenge is how to define the split function, which requires an understanding of the theory behind <span class="caps">GBM</span>.</p>
<h3>Optimization of <span class="caps">GBM</span></h3>
<p>Similar to the other regression models we have seen so far, a <span class="caps">GBM</span> with $K$ trees has a loss function, which is defined below.</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L}=\sum_{i=1}^{n} {(y_{i}-\hat{y}_{i})^{2}},<br />
\label{eq:gbm0}<br />
\end{equation}<br />
$$<br />
where</p>
<p>$$<br />
\begin{equation}<br />
\hat{y}_{i} = \sum_{t=1}^{K} {f_{t}(\boldsymbol{x}_{i})}.<br />
\label{eq:treepred}<br />
\end{equation}<br />
$$</p>
<p>$f_{t}$ denotes the prediction of the $t^{th}$ tree in the forest. As we mentioned previously, the fitting is done sequentially: when we fit the $t^{th}$ tree, all the previous $t-1$ trees are fixed, and the loss function for fitting the $t^{th}$ tree is given below.</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L}^{(t)}=\sum_{i=1}^{n} {(y_{i}- \sum_{l=1}^{t-1} {f_{l}(\boldsymbol{x}_{i})} - f_{t}(\boldsymbol{x}_{i}))^{2}}<br />
\label{eq:gbm_t}<br />
\end{equation}<br />
$$</p>
<p>In practice, a regularized loss function \eqref{eq:treepred1} is used instead of \eqref{eq:gbm_t} to reduce overfitting.</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L}^{(t)}=\sum_{i=1}^{n} {(y_{i}- \sum_{l=1}^{t-1} {f_{l}(\boldsymbol{x}_{i})} - f_{t}(\boldsymbol{x}_{i}))^{2}} + \Phi(f_{t}).<br />
\label{eq:treepred1}<br />
\end{equation}<br />
$$</p>
<p>Let’s follow the paper<sup class="footnote" id="fnr13"><a href="#fn13">13</a></sup> and use the number of leaves as well as the L2 penalty on the leaf values (also called weights) for regularization. The loss function then becomes</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L}^{(t)}=\sum_{i=1}^{n} {(y_{i}- \sum_{l=1}^{t-1} {f_{l}(\boldsymbol{x}_{i})} - f_{t}(\boldsymbol{x}_{i}))^{2}} + \gamma T + \frac 1 2 {\lambda \sum_{j=1}^{T}{\omega_{j}^{2}}} ,<br />
\label{eq:growtree0}<br />
\end{equation}<br />
$$<br />
where $T$ is the number of leaves of the current tree, and $\omega_{j}$ is the value associated with the $j^{th}$ leaf.</p>
<p>Again, we get an optimization problem, i.e., to minimize the loss function \eqref{eq:growtree0}. The minimization problem can also be viewed as a quadratic programming problem. However, it differs from the other optimization problems we have seen before, in the sense that the decision tree $f_{t}$ is a non-parametric model. A model is non-parametric if the model structure (here, the splits of the tree) is learned from the data rather than pre-determined.</p>
<p>A common approach used in <span class="caps">GBM</span> is a second order (Taylor) approximation of the loss around the prediction of the previous $t-1$ trees. With the second order approximation, the loss function becomes</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L}^{(t)}\approx \sum_{i=1}^{n} {(y_{i} - \sum_{l=1}^{t-1} {f_{l}(\boldsymbol{x}_{i})})^2} + \sum_{i=1}^{n} {(g_{i}f_{t}(\boldsymbol{x}_{i}) +\frac 1 2 {h_{i}f_{t}(\boldsymbol{x}_{i})^{2}})} + \gamma T + \frac 1 2 {\lambda \sum_{j=1}^{T}{\omega_{j}^{2}}} ,<br />
\label{eq:growtree}<br />
\end{equation}<br />
$$<br />
where $g_{i}=2(\sum_{l=1}^{t-1} {f_{l}(\boldsymbol{x}_{i})} - y_{i})$ and $h_{i}=2$ are the first and the second order derivatives of the squared loss $(y_{i}- \sum_{l=1}^{t-1} {f_{l}(\boldsymbol{x}_{i})}-f_{t}(\boldsymbol{x}_{i}))^{2}$ with respect to $f_{t}(\boldsymbol{x}_{i})$, evaluated at $f_{t}(\boldsymbol{x}_{i})=0$, i.e., at the prediction of the previous $t-1$ trees.</p>
<p>Let’s implement the function that calculates $g$ and $h$ and put it into <code>util.py</code>.</p>
<language>Python</language>
<p>chapter7/util.py<br />
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno">1 </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="lineno">2 </span>
<span class="lineno">3 </span><span class="k">def</span> <span class="nf">gh_lm</span><span class="p">(</span><span class="n">actual</span><span class="p">,</span> <span class="n">pred</span><span class="p">):</span>
<span class="lineno">4 </span> <span class="sd">'''</span>
<span class="lineno">5 </span><span class="sd"> gradient and hessian for linear regression loss</span>
<span class="lineno">6 </span><span class="sd"> '''</span>
<span class="lineno">7 </span> <span class="k">return</span> <span class="mi">2</span><span class="o">*</span><span class="p">(</span><span class="n">pred</span><span class="o">-</span><span class="n">actual</span><span class="p">),</span> <span class="mf">2.0</span></code></pre></figure></p>
<p>Since the first term does not depend on $f_{t}$, it is a constant when fitting the $t^{th}$ tree, and we can ignore it.</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L}^{(t)}\approx\sum_{i=1}^{n} {(g_{i}f_{t}(\boldsymbol{x}_{i}) +\frac 1 2 {h_{i}f_{t}(\boldsymbol{x}_{i})^{2}})} + \gamma T + \frac 1 2 {\lambda \sum_{j=1}^{T}{\omega_{j}^{2}}} .<br />
\label{eq:growtree1}<br />
\end{equation}<br />
$$</p>
<p>Let’s think about the prediction of an instance by the current tree. Each training instance falls under exactly one leaf of the tree, and thus the prediction of an instance is the value $\omega$ associated with the leaf that the instance belongs to. Let $I_{j}$ denote the set of training instances falling under the $j^{th}$ leaf. Based on this fact, the loss function can be further rewritten as follows,</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L}^{(t)} \approx \sum_{j=1}^{T} {\Big(\omega_{j} \sum_{i\in I_{j}}{g_{i}} + \frac 1 2 {\omega_{j}^{2} \big( \sum_{i\in I_{j}} h_{i} +\lambda \big)}\Big)} + \gamma T .<br />
\label{eq:growtree2}<br />
\end{equation}<br />
$$</p>
<p>When the structure of the tree is fixed, the loss function \eqref{eq:growtree2} is a convex quadratic function of each $\omega_{j}$, and the optimal solution can be obtained by setting the derivative to zero.</p>
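<p>Concretely, the first order condition for the $j^{th}$ leaf is</p>
<p>$$<br />
\frac{\partial \mathcal{L}^{(t)}}{\partial \omega_{j}} = \sum_{i\in I_{j}} {g_{i}} + \Big(\sum_{i\in I_{j}} {h_{i}} + \lambda \Big) \omega_{j} = 0,<br />
$$<br />
which gives</p>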
<p>$$<br />
\begin{equation}<br />
\omega_{j}=- \frac {\sum_{i\in I_{j} } g_{i}} {\sum_{i\in I_{j}} h_{i} +\lambda}<br />
\label{eq:omega}<br />
\end{equation}<br />
$$</p>
<p>Plugging \eqref{eq:omega} into the loss function results in the minimal loss under the current tree structure</p>
<p>$$<br />
\begin{equation}<br />
-\frac 1 2 \sum_{j=1}^{T} {\frac {(\sum_{i\in I_{j} }g_{i})^{2}} {\sum_{i\in I_{j}}h_{i} +\lambda} } + \gamma T .<br />
\label{eq:minimalloss}<br />
\end{equation}<br />
$$</p>
<p>Now let’s go back to the split function required by the tree grow function discussed previously. How do we determine whether a leaf needs splitting? \eqref{eq:minimalloss} gives the solution: we can calculate the loss reduction after the splitting, which is given below</p>
<p>$$<br />
\begin{equation}<br />
\frac 1 2 {\bigg(\frac {(\sum_{i\in I_{left}}g_{i})^{2}} {\sum_{i\in I_{left}}h_{i}+ \lambda } + \frac {(\sum_{i\in I_{right}}g_{i})^{2}} {\sum_{i\in I_{right}}h_{i} + \lambda} - \frac {(\sum_{i\in I}g_{i})^{2}} {\sum_{i\in I}h_{i}+\lambda} \bigg)} - \gamma .<br />
\label{eq:lossreduction}<br />
\end{equation}<br />
$$</p>
<p>Here $I$ denotes the instance set of the leaf before the splitting, and $I_{left}$ and $I_{right}$ denote the instance sets of the two resulting children. If the loss reduction is positive, the split function returns true; otherwise, it returns false.</p>
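<p>A toy numeric check of \eqref{eq:lossreduction} with made-up derivatives (four instances in a leaf split into two halves, $\lambda=1$ and $\gamma=0$):</p>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">g = [2.0, -4.0, 6.0, -2.0]  # assumed first order derivatives of the 4 instances
h = [2.0, 2.0, 2.0, 2.0]    # second order derivatives (always 2 for the squared loss)
lam, gamma = 1.0, 0.0

g_left, h_left = sum(g[:2]), sum(h[:2])    # candidate left child: first 2 instances
g_right, h_right = sum(g[2:]), sum(h[2:])  # candidate right child: last 2 instances

gain = 0.5 * (g_left**2 / (h_left + lam)
              + g_right**2 / (h_right + lam)
              - sum(g)**2 / (sum(h) + lam)) - gamma
print(gain)  # about 1.78; the split reduces the loss, so it would be accepted
</code></pre></figure>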
<p>So far, we have a few ingredients ready to implement our own <span class="caps">GBM</span>, which are listed below,</p>
<ul>
<li>the structure of a single decision tree in the forest, i.e., the <code>Tree</code> class defined above;</li>
<li>the node splitting mechanism, i.e., \eqref{eq:lossreduction};</li>
<li>the tree growing mechanism, i.e., the pseudo algorithm with the leaf value calculation \eqref{eq:omega}.</li>
</ul>
<p>However, there are a few additional items we need to go through before the implementation.</p>
<p>In Chapter 6, we saw how stochasticity works in iterative optimization algorithms. In <span class="caps">GBM</span>, the stochasticity can be applied both instance-wise and feature-wise, and the technique is very important in the optimization of <span class="caps">GBM</span>. More specifically, instance-wise, when we fit a new tree, we randomly draw a subsample from the training sample; feature-wise, we randomly select a subset of the features/variables when we fit a new tree. The stochasticity helps to reduce overfitting, and its extent can be controlled by arguments.</p>
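<p>A minimal sketch of the two kinds of subsampling (the fractions and seed are made up; the boolean-mask convention matches the <code>f_in_tree</code>/<code>x_in_node</code> arguments used in the code that follows):</p>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np

rng = np.random.default_rng(0)
n, m = 100, 8                      # numbers of instances and features
sub_sample, sub_feature = 0.8, 0.5

# instance-wise: each instance is kept for this tree with probability sub_sample
x_in_node = list(rng.random(n) &lt; sub_sample)

# feature-wise: a fixed fraction of the features is selected for this tree
nf = max(1, int(sub_feature * m))
f_in_tree = [False] * m
for k in rng.choice(m, nf, replace=False):
    f_in_tree[k] = True
</code></pre></figure>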
<p>Like the gradient descent algorithm, <span class="caps">GBM</span> also has a learning rate parameter, which scales the values of the leaves after each tree is fitted.</p>
<p>In practice, we may also not want too few instances under a leaf, in order to reduce potential overfitting. When there would be too few instances under a leaf, we simply stop the splitting process.</p>
<p>Now we have almost all the ingredients to make a working <span class="caps">GBM</span>. Let’s define the <code>split_node</code> function in the code snippet below.</p>
<language>Python</language>
<p>chapter7/grow.py<br />
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="kn">from</span> <span class="nn">tree</span> <span class="k">import</span> <span class="n">Tree</span>
<span class="lineno"> 2 </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="lineno"> 3 </span><span class="kn">import</span> <span class="nn">pdb</span>
<span class="lineno"> 4 </span>
<span class="lineno"> 5 </span>
<span class="lineno"> 6 </span><span class="k">def</span> <span class="nf">split_node</span><span class="p">(</span><span class="n">f_in_tree</span><span class="p">,</span> <span class="n">x_in_node</span><span class="p">,</span> <span class="n">x_val_sorted</span><span class="p">,</span> <span class="n">x_index_sorted</span><span class="p">,</span> <span class="n">g_tilde</span><span class="p">,</span> <span class="n">h_tilde</span><span class="p">,</span> <span class="n">lam</span><span class="p">,</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">min_instances</span><span class="p">):</span>
<span class="lineno"> 7 </span> <span class="sd">'''</span>
<span class="lineno"> 8 </span><span class="sd"> f_in_tree: a list of booleans indicating which variable/feature is selected in the tree</span>
<span class="lineno"> 9 </span><span class="sd"> x_in_ndoe: a list of booleans indicating which instance is used in the tree</span>
<span class="lineno"> 10 </span><span class="sd"> x_val_sorted: a nested list, x_val_sorted[feature index] is a list of instance values </span>
<span class="lineno"> 11 </span><span class="sd"> x_index_sorted: a nested list, x_index_sorted[feature index] is a list of instance indexes</span>
<span class="lineno"> 12 </span><span class="sd"> g_tilde: first order derivative</span>
<span class="lineno"> 13 </span><span class="sd"> h_tilde: second order derivative</span>
<span class="lineno"> 14 </span><span class="sd"> lam: lambda for regularization</span>
<span class="lineno"> 15 </span><span class="sd"> gamma: gamma for regularization</span>
<span class="lineno"> 16 </span><span class="sd"> min_instances: the minimal number of instances under a leaf</span>
<span class="lineno"> 17 </span><span class="sd"> at the beginning we assume all instances are on the right</span>
<span class="lineno"> 18 </span><span class="sd"> '''</span>
<span class="lineno"> 19 </span> <span class="k">if</span> <span class="nb">sum</span><span class="p">(</span><span class="n">x_in_node</span><span class="p">)</span> <span class="o"><</span> <span class="n">min_instances</span><span class="p">:</span>
<span class="lineno"> 20 </span> <span class="k">return</span> <span class="kc">False</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span>
<span class="lineno"> 21 </span> <span class="n">best_break</span> <span class="o">=</span> <span class="mf">0.0</span>
<span class="lineno"> 22 </span> <span class="n">best_feature</span><span class="p">,</span> <span class="n">best_location</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span>
<span class="lineno"> 23 </span> <span class="n">ncol</span><span class="p">,</span> <span class="n">nrow</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">f_in_tree</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">x_in_node</span><span class="p">)</span>
<span class="lineno"> 24 </span> <span class="n">g</span><span class="p">,</span> <span class="n">h</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span>
<span class="lineno"> 25 </span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">e</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">x_in_node</span><span class="p">):</span>
<span class="lineno"> 26 </span> <span class="k">if</span> <span class="n">e</span><span class="p">:</span>
<span class="lineno"> 27 </span> <span class="n">g</span> <span class="o">+=</span> <span class="n">g_tilde</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="lineno"> 28 </span> <span class="n">h</span> <span class="o">+=</span> <span class="n">h_tilde</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="lineno"> 29 </span> <span class="n">base_score</span> <span class="o">=</span> <span class="n">g</span><span class="o">*</span><span class="n">g</span><span class="o">/</span><span class="p">(</span><span class="n">h</span><span class="o">+</span><span class="n">lam</span><span class="p">)</span>
<span class="lineno"> 30 </span> <span class="n">score_reduction</span> <span class="o">=</span> <span class="o">-</span><span class="n">np</span><span class="o">.</span><span class="n">inf</span>
<span class="lineno"> 31 </span> <span class="n">best_w_left</span><span class="p">,</span> <span class="n">best_w_right</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span>
<span class="lineno"> 32 </span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">ncol</span><span class="p">):</span>
<span class="lineno"> 33 </span> <span class="k">if</span> <span class="n">f_in_tree</span><span class="p">[</span><span class="n">k</span><span class="p">]:</span>
<span class="lineno"> 34 </span> <span class="c1"># if the feature is selected for this tree</span>
<span class="lineno"> 35 </span> <span class="n">best_n_left_k</span> <span class="o">=</span> <span class="mi">0</span>
<span class="lineno"> 36 </span> <span class="n">n_left_k</span> <span class="o">=</span> <span class="mi">0</span>
<span class="lineno"> 37 </span> <span class="c1"># feature is in the current tree</span>
<span class="lineno"> 38 </span> <span class="n">g_left</span><span class="p">,</span> <span class="n">h_left</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span>
<span class="lineno"> 39 </span> <span class="n">g_right</span><span class="p">,</span> <span class="n">h_right</span> <span class="o">=</span> <span class="n">g</span><span class="o">-</span><span class="n">g_left</span><span class="p">,</span> <span class="n">h</span><span class="o">-</span><span class="n">h_left</span>
<span class="lineno"> 40 </span> <span class="c1"># score reduction for current feature k</span>
<span class="lineno"> 41 </span> <span class="n">score_reduction_k</span> <span class="o">=</span> <span class="o">-</span><span class="n">np</span><span class="o">.</span><span class="n">inf</span>
<span class="lineno"> 42 </span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">nrow</span><span class="p">):</span>
<span class="lineno"> 43 </span> <span class="c1"># for each in sample, we try to split on it</span>
<span class="lineno"> 44 </span> <span class="n">index</span> <span class="o">=</span> <span class="n">x_index_sorted</span><span class="p">[</span><span class="n">k</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="lineno"> 45 </span> <span class="k">if</span> <span class="n">x_in_node</span><span class="p">[</span><span class="n">index</span><span class="p">]:</span>
<span class="lineno"> 46 </span> <span class="n">n_left_k</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="lineno"> 47 </span> <span class="n">best_n_left_k</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="lineno"> 48 </span> <span class="n">g_left</span> <span class="o">+=</span> <span class="n">g_tilde</span><span class="p">[</span><span class="n">index</span><span class="p">]</span>
<span class="lineno"> 49 </span> <span class="n">g_right</span> <span class="o">=</span> <span class="n">g</span><span class="o">-</span><span class="n">g_left</span>
<span class="lineno"> 50 </span> <span class="n">h_left</span> <span class="o">+=</span> <span class="n">h_tilde</span><span class="p">[</span><span class="n">index</span><span class="p">]</span>
<span class="lineno"> 51 </span> <span class="n">h_right</span> <span class="o">=</span> <span class="n">h</span><span class="o">-</span><span class="n">h_left</span>
<span class="lineno"> 52 </span> <span class="c1"># new score reduction</span>
<span class="lineno"> 53 </span> <span class="n">score_reduction_k_i</span> <span class="o">=</span> <span class="n">g_left</span><span class="o">*</span><span class="n">g_left</span><span class="o">/</span><span class="p">(</span><span class="n">h_left</span><span class="o">+</span><span class="n">lam</span><span class="p">)</span> <span class="o">+</span> \
<span class="lineno"> 54 </span> <span class="p">(</span><span class="n">g_right</span><span class="o">*</span><span class="n">g_right</span><span class="p">)</span><span class="o">/</span><span class="p">(</span><span class="n">h_right</span><span class="o">+</span><span class="n">lam</span><span class="p">)</span><span class="o">-</span><span class="n">base_score</span>
<span class="lineno"> 55 </span> <span class="k">if</span> <span class="n">score_reduction_k</span> <span class="o"><=</span> <span class="n">score_reduction_k_i</span><span class="p">:</span>
<span class="lineno"> 56 </span> <span class="n">best_n_left_k</span> <span class="o">=</span> <span class="n">n_left_k</span>
<span class="lineno"> 57 </span> <span class="n">best_break_k</span> <span class="o">=</span> <span class="n">x_val_sorted</span><span class="p">[</span><span class="n">k</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="lineno"> 58 </span> <span class="n">best_location_k</span> <span class="o">=</span> <span class="n">i</span>
<span class="lineno"> 59 </span> <span class="n">score_reduction_k</span> <span class="o">=</span> <span class="n">score_reduction_k_i</span>
<span class="lineno"> 60 </span> <span class="n">w_left_k</span> <span class="o">=</span> <span class="o">-</span><span class="n">g_left</span><span class="o">/</span><span class="p">(</span><span class="n">h_left</span><span class="o">+</span><span class="n">lam</span><span class="p">)</span>
<span class="lineno"> 61 </span> <span class="n">w_right_k</span> <span class="o">=</span> <span class="o">-</span><span class="n">g_right</span><span class="o">/</span><span class="p">(</span><span class="n">h_right</span><span class="o">+</span><span class="n">lam</span><span class="p">)</span>
<span class="lineno"> 62 </span>
<span class="lineno"> 63 </span> <span class="c1"># if the score reduction on feature k is a better candidate</span>
<span class="lineno"> 64 </span> <span class="k">if</span> <span class="n">score_reduction_k</span> <span class="o">>=</span> <span class="n">score_reduction</span><span class="p">:</span>
<span class="lineno"> 65 </span> <span class="n">score_reduction</span> <span class="o">=</span> <span class="n">score_reduction_k</span>
<span class="lineno"> 66 </span> <span class="n">best_feature</span> <span class="o">=</span> <span class="n">k</span>
<span class="lineno"> 67 </span> <span class="n">best_break</span> <span class="o">=</span> <span class="n">best_break_k</span>
<span class="lineno"> 68 </span> <span class="n">best_location</span> <span class="o">=</span> <span class="n">best_location_k</span>
<span class="lineno"> 69 </span> <span class="n">best_w_left</span> <span class="o">=</span> <span class="n">w_left_k</span>
<span class="lineno"> 70 </span> <span class="n">best_w_right</span> <span class="o">=</span> <span class="n">w_right_k</span>
<span class="lineno"> 71 </span> <span class="k">return</span> <span class="mf">0.5</span><span class="o">*</span><span class="n">score_reduction</span> <span class="o">>=</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">best_feature</span><span class="p">,</span> <span class="n">best_break</span><span class="p">,</span> <span class="n">best_location</span><span class="p">,</span> <span class="n">best_w_left</span><span class="p">,</span> <span class="n">best_w_right</span><span class="p">,</span> <span class="n">score_reduction</span>
<span class="lineno"> 72 </span>
<span class="lineno"> 73 </span>
<span class="lineno"> 74 </span><span class="k">def</span> <span class="nf">grow_tree</span><span class="p">(</span><span class="n">current_tree</span><span class="p">,</span> <span class="n">f_in_tree</span><span class="p">,</span> <span class="n">x_in_node</span><span class="p">,</span> <span class="n">max_depth</span><span class="p">,</span> <span class="n">x_val_sorted</span><span class="p">,</span> <span class="n">x_index_sorted</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">g_tilde</span><span class="p">,</span> <span class="n">h_tilde</span><span class="p">,</span> <span class="n">eta</span><span class="p">,</span> <span class="n">lam</span><span class="p">,</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">min_instances</span><span class="p">):</span>
<span class="lineno"> 75 </span> <span class="sd">'''</span>
<span class="lineno"> 76 </span><span class="sd"> current_tree: the current tree to grow, i.e., a node</span>
<span class="lineno"> 77 </span><span class="sd"> f_in_tree, x_in_node, x_val_sorted, x_index_sorted: see split_node function</span>
<span class="lineno"> 78 </span><span class="sd"> max_depth: maximinum depth to grow</span>
<span class="lineno"> 79 </span><span class="sd"> eta: learning rate</span>
<span class="lineno"> 80 </span><span class="sd"> y: the response variable</span>
<span class="lineno"> 81 </span><span class="sd"> '''</span>
<span class="lineno"> 82 </span> <span class="n">nrow</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">y</span><span class="p">)</span>
<span class="lineno"> 83 </span> <span class="k">if</span> <span class="n">max_depth</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="lineno"> 84 </span> <span class="k">return</span>
<span class="lineno"> 85 </span> <span class="c1"># check if we need a split</span>
<span class="lineno"> 86 </span> <span class="n">do_split</span><span class="p">,</span> <span class="n">best_feature</span><span class="p">,</span> <span class="n">best_break</span><span class="p">,</span> <span class="n">best_location</span><span class="p">,</span> <span class="n">w_left</span><span class="p">,</span> <span class="n">w_right</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">split_node</span><span class="p">(</span>
<span class="lineno"> 87 </span> <span class="n">f_in_tree</span><span class="p">,</span> <span class="n">x_in_node</span><span class="p">,</span> <span class="n">x_val_sorted</span><span class="p">,</span> <span class="n">x_index_sorted</span><span class="p">,</span> <span class="n">g_tilde</span><span class="p">,</span> <span class="n">h_tilde</span><span class="p">,</span> <span class="n">lam</span><span class="p">,</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">min_instances</span><span class="p">)</span>
<span class="lineno"> 88 </span>
<span class="lineno"> 89 </span> <span class="k">if</span> <span class="n">do_split</span><span class="p">:</span>
<span class="lineno"> 90 </span> <span class="c1"># update the value/weight with the learning rate eta</span>
<span class="lineno"> 91 </span> <span class="n">w_left_scaled</span> <span class="o">=</span> <span class="n">w_left</span><span class="o">*</span><span class="n">eta</span>
<span class="lineno"> 92 </span> <span class="n">w_right_scaled</span> <span class="o">=</span> <span class="n">w_right</span><span class="o">*</span><span class="n">eta</span>
<span class="lineno"> 93 </span> <span class="n">current_tree</span><span class="o">.</span><span class="n">variable_id</span> <span class="o">=</span> <span class="n">best_feature</span>
<span class="lineno"> 94 </span> <span class="n">current_tree</span><span class="o">.</span><span class="n">break_point</span> <span class="o">=</span> <span class="n">best_break</span>
<span class="lineno"> 95 </span> <span class="n">current_tree</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="kc">None</span>
<span class="lineno"> 96 </span>
<span class="lineno"> 97 </span> <span class="c1"># initialize the left subtree</span>
<span class="lineno"> 98 </span> <span class="n">current_tree</span><span class="o">.</span><span class="n">left</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="n">w_left_scaled</span><span class="p">)</span>
<span class="lineno"> 99 </span> <span class="c1"># initialize the right subtree</span>
<span class="lineno">100 </span> <span class="n">current_tree</span><span class="o">.</span><span class="n">right</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="n">w_right_scaled</span><span class="p">)</span>
<span class="lineno">101 </span> <span class="c1"># update if an instance is in left or right</span>
<span class="lineno">102 </span> <span class="n">x_in_left_node</span> <span class="o">=</span> <span class="p">[</span><span class="kc">False</span><span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="n">x_in_node</span><span class="p">)</span>
<span class="lineno">103 </span> <span class="n">x_in_right_node</span> <span class="o">=</span> <span class="p">[</span><span class="kc">False</span><span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="n">x_in_node</span><span class="p">)</span>
<span class="lineno">104 </span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">nrow</span><span class="p">):</span>
<span class="lineno">105 </span> <span class="n">index</span> <span class="o">=</span> <span class="n">x_index_sorted</span><span class="p">[</span><span class="n">best_feature</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="lineno">106 </span> <span class="k">if</span> <span class="n">x_in_node</span><span class="p">[</span><span class="n">index</span><span class="p">]:</span>
<span class="lineno">107 </span> <span class="k">if</span> <span class="n">i</span> <span class="o"><=</span> <span class="n">best_location</span><span class="p">:</span>
<span class="lineno">108 </span> <span class="n">x_in_left_node</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<span class="lineno">109 </span> <span class="k">else</span><span class="p">:</span>
<span class="lineno">110 </span> <span class="n">x_in_right_node</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<span class="lineno">111 </span> <span class="c1"># recursively grow its left subtree</span>
<span class="lineno">112 </span> <span class="n">grow_tree</span><span class="p">(</span><span class="n">current_tree</span><span class="o">.</span><span class="n">left</span><span class="p">,</span> <span class="n">f_in_tree</span><span class="p">,</span> <span class="n">x_in_left_node</span><span class="p">,</span> <span class="n">max_depth</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span>
<span class="lineno">113 </span> <span class="n">x_val_sorted</span><span class="p">,</span> <span class="n">x_index_sorted</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">g_tilde</span><span class="p">,</span> <span class="n">h_tilde</span><span class="p">,</span> <span class="n">eta</span><span class="p">,</span> <span class="n">lam</span><span class="p">,</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">min_instances</span><span class="p">)</span>
<span class="lineno">114 </span> <span class="c1"># recursively grow its right subtree</span>
<span class="lineno">115 </span> <span class="n">grow_tree</span><span class="p">(</span><span class="n">current_tree</span><span class="o">.</span><span class="n">right</span><span class="p">,</span> <span class="n">f_in_tree</span><span class="p">,</span> <span class="n">x_in_right_node</span><span class="p">,</span> <span class="n">max_depth</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span>
<span class="lineno">116 </span> <span class="n">x_val_sorted</span><span class="p">,</span> <span class="n">x_index_sorted</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">g_tilde</span><span class="p">,</span> <span class="n">h_tilde</span><span class="p">,</span> <span class="n">eta</span><span class="p">,</span> <span class="n">lam</span><span class="p">,</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">min_instances</span><span class="p">)</span>
<span class="lineno">117 </span> <span class="k">else</span><span class="p">:</span>
<span class="lineno">118 </span> <span class="c1"># current node is a leaf, so we update the value/weight of the leaf</span>
<span class="lineno">119 </span> <span class="n">g</span><span class="p">,</span> <span class="n">h</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span>
<span class="lineno">120 </span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">e</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">x_in_node</span><span class="p">):</span>
<span class="lineno">121 </span> <span class="k">if</span> <span class="n">e</span><span class="p">:</span>
<span class="lineno">122 </span> <span class="n">g</span> <span class="o">+=</span> <span class="n">g_tilde</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="lineno">123 </span> <span class="n">h</span> <span class="o">+=</span> <span class="n">h_tilde</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="lineno">124 </span> <span class="n">w_left_scaled</span> <span class="o">=</span> <span class="o">-</span><span class="n">g</span><span class="o">/</span><span class="p">(</span><span class="n">h</span><span class="o">+</span><span class="n">lam</span><span class="p">)</span><span class="o">*</span><span class="n">eta</span>
<span class="lineno">125 </span> <span class="n">current_tree</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="n">w_left_scaled</span></code></pre></figure></p>
<p>And the implementation of the <code>GBM</code> class is given below.</p>
<language>Python</language>
<p>chapter7/gbm.py<br />
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="kn">from</span> <span class="nn">tree</span> <span class="k">import</span> <span class="n">Tree</span>
<span class="lineno"> 2 </span><span class="kn">from</span> <span class="nn">grow</span> <span class="k">import</span> <span class="n">grow_tree</span><span class="p">,</span> <span class="n">split_node</span>
<span class="lineno"> 3 </span><span class="kn">from</span> <span class="nn">utils</span> <span class="k">import</span> <span class="n">gh_lm</span><span class="p">,</span> <span class="n">rmse</span>
<span class="lineno"> 4 </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="lineno"> 5 </span>
<span class="lineno"> 6 </span>
<span class="lineno"> 7 </span><span class="k">class</span> <span class="nc">GBM</span><span class="p">:</span>
<span class="lineno"> 8 </span> <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">,</span> <span class="n">depth</span><span class="p">,</span> <span class="n">eta</span><span class="p">,</span> <span class="n">lam</span><span class="p">,</span> <span class="n">gamma</span><span class="p">,</span> <span class="n">sub_sample</span><span class="p">,</span> <span class="n">sub_feature</span><span class="p">,</span> <span class="n">min_instances</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
<span class="lineno"> 9 </span> <span class="sd">'''</span>
<span class="lineno">10 </span><span class="sd"> x_train, y_train: training data</span>
<span class="lineno">11 </span><span class="sd"> depth: maximum depth of each tree</span>
<span class="lineno">12 </span><span class="sd"> eta: learning rate</span>
<span class="lineno">13 </span><span class="sd"> lam and gamma: regularization parameters</span>
<span class="lineno">14 </span><span class="sd"> sub_sample: control the instance-wise stochasticity</span>
<span class="lineno">15 </span><span class="sd"> sub_feature: control the feature-wise stochasticity</span>
<span class="lineno">16 </span><span class="sd"> min_instances: control the mimimum number of instances under a leaf</span>
<span class="lineno">17 </span><span class="sd"> '''</span>
<span class="lineno">18 </span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">x_train</span><span class="p">)</span>
<span class="lineno">19 </span> <span class="bp">self</span><span class="o">.</span><span class="n">m</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">x_train</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="lineno">20 </span> <span class="bp">self</span><span class="o">.</span><span class="n">x_train</span> <span class="o">=</span> <span class="n">x_train</span>
<span class="lineno">21 </span> <span class="bp">self</span><span class="o">.</span><span class="n">y_train</span> <span class="o">=</span> <span class="n">y_train</span>
<span class="lineno">22 </span> <span class="bp">self</span><span class="o">.</span><span class="n">x_test</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">y_test</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span>
<span class="lineno">23 </span> <span class="bp">self</span><span class="o">.</span><span class="n">depth</span> <span class="o">=</span> <span class="n">depth</span>
<span class="lineno">24 </span> <span class="bp">self</span><span class="o">.</span><span class="n">eta</span> <span class="o">=</span> <span class="n">eta</span>
<span class="lineno">25 </span> <span class="bp">self</span><span class="o">.</span><span class="n">lam</span> <span class="o">=</span> <span class="n">lam</span>
<span class="lineno">26 </span> <span class="bp">self</span><span class="o">.</span><span class="n">gamma</span> <span class="o">=</span> <span class="n">gamma</span>
<span class="lineno">27 </span> <span class="bp">self</span><span class="o">.</span><span class="n">sub_sample</span> <span class="o">=</span> <span class="n">sub_sample</span>
<span class="lineno">28 </span> <span class="bp">self</span><span class="o">.</span><span class="n">sub_feature</span> <span class="o">=</span> <span class="n">sub_feature</span>
<span class="lineno">29 </span> <span class="bp">self</span><span class="o">.</span><span class="n">min_instances</span> <span class="o">=</span> <span class="n">min_instances</span>
<span class="lineno">30 </span>
<span class="lineno">31 </span> <span class="bp">self</span><span class="o">.</span><span class="n">y_tilde</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="n">y_train</span><span class="p">)</span>
<span class="lineno">32 </span> <span class="bp">self</span><span class="o">.</span><span class="n">g_tilde</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">h_tilde</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">*</span> \
<span class="lineno">33 </span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">y_tilde</span><span class="p">),</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">y_tilde</span><span class="p">)</span>
<span class="lineno">34 </span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">y_tilde</span><span class="p">)):</span>
<span class="lineno">35 </span> <span class="bp">self</span><span class="o">.</span><span class="n">g_tilde</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">h_tilde</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">gh_lm</span><span class="p">(</span>
<span class="lineno">36 </span> <span class="n">y_train</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">y_tilde</span><span class="p">[</span><span class="n">i</span><span class="p">])</span>
<span class="lineno">37 </span> <span class="n">x_columns</span> <span class="o">=</span> <span class="n">x_train</span><span class="o">.</span><span class="n">transpose</span><span class="p">()</span>
<span class="lineno">38 </span> <span class="bp">self</span><span class="o">.</span><span class="n">nf</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">m</span><span class="p">,</span> <span class="nb">max</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">sub_feature</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">m</span><span class="p">)))</span>
<span class="lineno">39 </span> <span class="bp">self</span><span class="o">.</span><span class="n">x_val_sorted</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">x_columns</span><span class="p">)</span>
<span class="lineno">40 </span> <span class="bp">self</span><span class="o">.</span><span class="n">x_index_sorted</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argsort</span><span class="p">(</span><span class="n">x_columns</span><span class="p">)</span>
<span class="lineno">41 </span> <span class="bp">self</span><span class="o">.</span><span class="n">forest</span> <span class="o">=</span> <span class="p">[]</span>
<span class="lineno">42 </span>
<span class="lineno">43 </span> <span class="k">def</span> <span class="nf">set_test_data</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x_test</span><span class="p">,</span> <span class="n">y_test</span><span class="p">):</span>
<span class="lineno">44 </span> <span class="bp">self</span><span class="o">.</span><span class="n">x_test</span> <span class="o">=</span> <span class="n">x_test</span>
<span class="lineno">45 </span> <span class="bp">self</span><span class="o">.</span><span class="n">y_test</span> <span class="o">=</span> <span class="n">y_test</span>
<span class="lineno">46 </span>
<span class="lineno">47 </span> <span class="k">def</span> <span class="nf">predict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x_new</span><span class="p">):</span>
<span class="lineno">48 </span> <span class="n">y_hat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">0.0</span><span class="p">]</span><span class="o">*</span><span class="nb">len</span><span class="p">(</span><span class="n">x_new</span><span class="p">))</span>
<span class="lineno">49 </span> <span class="k">for</span> <span class="n">tree</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">forest</span><span class="p">:</span>
<span class="lineno">50 </span> <span class="n">y_hat</span> <span class="o">+=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">tree</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">x_new</span><span class="p">))</span>
<span class="lineno">51 </span> <span class="k">return</span> <span class="n">y_hat</span>
<span class="lineno">52 </span>
<span class="lineno">53 </span> <span class="k">def</span> <span class="nf">fit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">max_tree</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">):</span>
<span class="lineno">54 </span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">seed</span><span class="p">(</span><span class="n">seed</span><span class="p">)</span>
<span class="lineno">55 </span> <span class="bp">self</span><span class="o">.</span><span class="n">forest</span> <span class="o">=</span> <span class="p">[]</span>
<span class="lineno">56 </span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span>
<span class="lineno">57 </span> <span class="k">while</span> <span class="n">i</span> <span class="o"><</span> <span class="n">max_tree</span><span class="p">:</span>
<span class="lineno">58 </span> <span class="c1"># let's fit tree i</span>
<span class="lineno">59 </span> <span class="c1"># instance-wise stochasticity</span>
<span class="lineno">60 </span> <span class="n">x_in_node</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">([</span><span class="kc">True</span><span class="p">,</span> <span class="kc">False</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="p">[</span>
<span class="lineno">61 </span> <span class="bp">self</span><span class="o">.</span><span class="n">sub_sample</span><span class="p">,</span> <span class="mi">1</span><span class="o">-</span><span class="bp">self</span><span class="o">.</span><span class="n">sub_sample</span><span class="p">])</span>
<span class="lineno">62 </span> <span class="c1"># feature-wise stochasticity</span>
<span class="lineno">63 </span> <span class="n">f_in_tree_</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">(</span>
<span class="lineno">64 </span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">m</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">nf</span><span class="p">,</span> <span class="n">replace</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="lineno">65 </span> <span class="n">f_in_tree</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="kc">False</span><span class="p">]</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">m</span><span class="p">)</span>
<span class="lineno">66 </span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">f_in_tree_</span><span class="p">:</span>
<span class="lineno">67 </span> <span class="n">f_in_tree</span><span class="p">[</span><span class="n">e</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<span class="lineno">68 </span> <span class="k">del</span> <span class="n">f_in_tree_</span>
<span class="lineno">69 </span> <span class="c1"># initialize the root of this tree</span>
<span class="lineno">70 </span> <span class="n">root</span> <span class="o">=</span> <span class="n">Tree</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="lineno">71 </span> <span class="c1"># grow the tree from root</span>
<span class="lineno">72 </span> <span class="n">grow_tree</span><span class="p">(</span><span class="n">root</span><span class="p">,</span> <span class="n">f_in_tree</span><span class="p">,</span> <span class="n">x_in_node</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">depth</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">x_val_sorted</span><span class="p">,</span>
<span class="lineno">73 </span> <span class="bp">self</span><span class="o">.</span><span class="n">x_index_sorted</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">y_train</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">g_tilde</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">h_tilde</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">eta</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">lam</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">gamma</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">min_instances</span><span class="p">)</span>
<span class="lineno">74 </span> <span class="k">if</span> <span class="n">root</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">75 </span> <span class="n">i</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="lineno">76 </span> <span class="bp">self</span><span class="o">.</span><span class="n">forest</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">root</span><span class="p">)</span>
<span class="lineno">77 </span> <span class="k">else</span><span class="p">:</span>
<span class="lineno">78 </span> <span class="nb">next</span>
<span class="lineno">79 </span> <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">):</span>
<span class="lineno">80 </span> <span class="bp">self</span><span class="o">.</span><span class="n">y_tilde</span><span class="p">[</span><span class="n">j</span><span class="p">]</span> <span class="o">+=</span> <span class="bp">self</span><span class="o">.</span><span class="n">forest</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">_predict_single</span><span class="p">(</span>
<span class="lineno">81 </span> <span class="bp">self</span><span class="o">.</span><span class="n">x_train</span><span class="p">[</span><span class="n">j</span><span class="p">])</span>
<span class="lineno">82 </span> <span class="bp">self</span><span class="o">.</span><span class="n">g_tilde</span><span class="p">[</span><span class="n">j</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">h_tilde</span><span class="p">[</span><span class="n">j</span><span class="p">]</span> <span class="o">=</span> <span class="n">gh_lm</span><span class="p">(</span>
<span class="lineno">83 </span> <span class="bp">self</span><span class="o">.</span><span class="n">y_train</span><span class="p">[</span><span class="n">j</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">y_tilde</span><span class="p">[</span><span class="n">j</span><span class="p">])</span>
<span class="lineno">84 </span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">x_test</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">85 </span> <span class="c1"># test on the testing instances</span>
<span class="lineno">86 </span> <span class="n">y_hat</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">x_test</span><span class="p">)</span>
<span class="lineno">87 </span> <span class="nb">print</span><span class="p">(</span><span class="s2">"iter: </span><span class="si">{0:>4}</span><span class="s2"> rmse: </span><span class="si">{1:1.6f}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="lineno">88 </span> <span class="n">i</span><span class="p">,</span> <span class="n">rmse</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">y_test</span><span class="p">,</span> <span class="n">y_hat</span><span class="p">)))</span></code></pre></figure></p>
<p>Now let’s see the performance of our <span class="caps">GBM</span> implemented from scratch.</p>
<language>Python</language>
<p>chapter7/test_gbm.py<br />
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="kn">from</span> <span class="nn">gbm</span> <span class="k">import</span> <span class="n">GBM</span>
<span class="lineno"> 2 </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="lineno"> 3 </span><span class="kn">from</span> <span class="nn">sklearn</span> <span class="k">import</span> <span class="n">datasets</span>
<span class="lineno"> 4 </span><span class="kn">from</span> <span class="nn">sklearn.utils</span> <span class="k">import</span> <span class="n">shuffle</span>
<span class="lineno"> 5 </span>
<span class="lineno"> 6 </span>
<span class="lineno"> 7 </span><span class="k">def</span> <span class="nf">get_boston_data</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="mi">42</span><span class="p">):</span>
<span class="lineno"> 8 </span> <span class="n">boston</span> <span class="o">=</span> <span class="n">datasets</span><span class="o">.</span><span class="n">load_boston</span><span class="p">()</span>
<span class="lineno"> 9 </span> <span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">shuffle</span><span class="p">(</span><span class="n">boston</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">boston</span><span class="o">.</span><span class="n">target</span><span class="p">,</span> <span class="n">random_state</span><span class="o">=</span><span class="n">seed</span><span class="p">)</span>
<span class="lineno">10 </span> <span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="lineno">11 </span> <span class="n">offset</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">X</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">*</span> <span class="mf">0.8</span><span class="p">)</span>
<span class="lineno">12 </span> <span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span> <span class="o">=</span> <span class="n">X</span><span class="p">[:</span><span class="n">offset</span><span class="p">],</span> <span class="n">y</span><span class="p">[:</span><span class="n">offset</span><span class="p">]</span>
<span class="lineno">13 </span> <span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span> <span class="o">=</span> <span class="n">X</span><span class="p">[</span><span class="n">offset</span><span class="p">:],</span> <span class="n">y</span><span class="p">[</span><span class="n">offset</span><span class="p">:]</span>
<span class="lineno">14 </span> <span class="k">return</span> <span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">,</span> <span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span>
<span class="lineno">15 </span>
<span class="lineno">16 </span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span>
<span class="lineno">17 </span> <span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">,</span> <span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span> <span class="o">=</span> <span class="n">get_boston_data</span><span class="p">(</span><span class="mi">42</span><span class="p">)</span>
<span class="lineno">18 </span> <span class="n">gbm</span> <span class="o">=</span> <span class="n">GBM</span><span class="p">(</span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span><span class="p">,</span> <span class="n">depth</span><span class="o">=</span><span class="mi">6</span><span class="p">,</span> <span class="n">eta</span><span class="o">=</span><span class="mf">0.05</span><span class="p">,</span> <span class="n">lam</span><span class="o">=</span><span class="mf">1.0</span><span class="p">,</span>
<span class="lineno">19 </span> <span class="n">gamma</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">sub_sample</span><span class="o">=</span><span class="mf">0.5</span><span class="p">,</span> <span class="n">sub_feature</span><span class="o">=</span><span class="mf">0.7</span><span class="p">)</span>
<span class="lineno">20 </span> <span class="n">gbm</span><span class="o">.</span><span class="n">set_test_data</span><span class="p">(</span><span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span><span class="p">)</span>
<span class="lineno">21 </span> <span class="n">gbm</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">max_tree</span><span class="o">=</span><span class="mi">200</span><span class="p">)</span></code></pre></figure></p>
<p>Running the code above, we get the output below.</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="n">chapter7</span> <span class="err">$</span><span class="n">python3</span><span class="o">.</span><span class="mi">7</span> <span class="n">test_gbm</span><span class="o">.</span><span class="n">py</span>
<span class="lineno"> 2 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">1</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">23.063019</span>
<span class="lineno"> 3 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">2</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">21.997972</span>
<span class="lineno"> 4 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">3</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">21.026602</span>
<span class="lineno"> 5 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">4</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">20.043397</span>
<span class="lineno"> 6 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">5</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">19.210746</span>
<span class="lineno"> 7 </span><span class="o">...</span>
<span class="lineno"> 8 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">196</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">2.560747</span>
<span class="lineno"> 9 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">197</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">2.544847</span>
<span class="lineno">10 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">198</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">2.541102</span>
<span class="lineno">11 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">199</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">2.537366</span>
<span class="lineno">12 </span><span class="nb">iter</span><span class="p">:</span> <span class="mi">200</span> <span class="n">rmse</span><span class="p">:</span> <span class="mf">2.535143</span></code></pre></figure><p>We don’t implement the model in R, but it is not difficult to do based on the Python implementation above.</p>
<p><span class="caps">GBM</span> can be used with various loss functions, and the major difference is the implementation of the first/second order derivatives, i.e., $g$ and $h$.</p>
<p>Performance aside, there are two major features missing from our implementation: a) cross-validation and b) early stopping.<br />
We have talked about cross-validation, but what is early stopping? It is a very useful technique in <span class="caps">GBM</span>. Usually, the cross-validated loss decreases as new trees are added at the beginning, but at a certain point the loss may start increasing as more trees are fitted (due to overfitting). Thus, we may select the best number of trees based on the cross-validated loss, i.e., stop the fitting process when the cross-validated loss no longer decreases. In practice, we don’t want to stop the fitting immediately when the cross-validated loss starts increasing. Instead, we specify a number of trees, e.g., 50, as a buffer, and stop the fitting only if the cross-validated loss hasn’t decreased within that many additional trees.</p>
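<p>The buffer logic itself is only a few lines of bookkeeping. Below is a minimal sketch (independent of our <span class="caps">GBM</span> class), assuming a hypothetical function <code>cv_loss(i)</code> that returns the cross-validated loss of a model with <code>i</code> trees.</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">def fit_with_early_stopping(cv_loss, max_tree=1000, patience=50):
    # cv_loss: hypothetical function giving the cross-validated loss with i trees
    best_loss, best_iter = float("inf"), 0
    for i in range(1, max_tree + 1):
        loss = cv_loss(i)
        if loss < best_loss:
            best_loss, best_iter = loss, i
        elif i - best_iter >= patience:
            # no improvement within the buffer of `patience` trees - stop
            break
    return best_iter</code></pre></figure>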
<p>Early stopping is also used in other machine learning models, for example, neural networks. Ideally, we would like to have early stopping based on the cross-validated loss. But when the training process is time-consuming, it’s fine to use the loss on a testing data set<sup class="footnote" id="fnr14"><a href="#fn14">14</a></sup>.</p>
<p>The commonly used <span class="caps">GBM</span> packages include <code>XGBoost</code><sup class="footnote" id="fnr15"><a href="#fn15">15</a></sup>, <code>LightGBM</code><sup class="footnote" id="fnr16"><a href="#fn16">16</a></sup> and <code>CatBoost</code><sup class="footnote" id="fnr17"><a href="#fn17">17</a></sup>.</p>
<p>Let’s see how to use <code>XGBoost</code> for the same regression task on the Boston dataset.</p>
<language>R</language>
<p>chapter7/xgb.R<br />
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno"> 1 </span><span class="kn">library</span><span class="p">(</span>xgboost<span class="p">)</span>
<span class="lineno"> 2 </span><span class="kn">library</span><span class="p">(</span>MASS<span class="p">)</span>
<span class="lineno"> 3 </span><span class="kn">library</span><span class="p">(</span>Metrics<span class="p">)</span>
<span class="lineno"> 4 </span><span class="kp">set.seed</span><span class="p">(</span><span class="m">42</span><span class="p">)</span>
<span class="lineno"> 5 </span>train_index <span class="o">=</span> <span class="kp">sample</span><span class="p">(</span><span class="kp">nrow</span><span class="p">(</span>Boston<span class="p">),</span> <span class="kp">floor</span><span class="p">(</span><span class="m">0.8</span> <span class="o">*</span> <span class="kp">nrow</span><span class="p">(</span>Boston<span class="p">)),</span> replace <span class="o">=</span> <span class="kc">FALSE</span><span class="p">)</span>
<span class="lineno"> 6 </span>Boston <span class="o">=</span> <span class="kp">data.matrix</span><span class="p">(</span>Boston<span class="p">)</span>
<span class="lineno"> 7 </span>target_col <span class="o">=</span> <span class="kp">which</span><span class="p">(</span><span class="kp">colnames</span><span class="p">(</span>Boston<span class="p">)</span> <span class="o">==</span> <span class="s">'medv'</span><span class="p">)</span>
<span class="lineno"> 8 </span>X_train <span class="o">=</span> Boston<span class="p">[</span>train_index<span class="p">,</span> <span class="o">-</span>target_col<span class="p">]</span>
<span class="lineno"> 9 </span>y_train <span class="o">=</span> Boston<span class="p">[</span>train_index<span class="p">,</span> target_col<span class="p">]</span>
<span class="lineno">10 </span>X_test <span class="o">=</span> Boston<span class="p">[</span><span class="o">-</span>train_index<span class="p">,</span> <span class="o">-</span>target_col<span class="p">]</span>
<span class="lineno">11 </span>y_test <span class="o">=</span> Boston<span class="p">[</span><span class="o">-</span>train_index<span class="p">,</span> target_col<span class="p">]</span>
<span class="lineno">12 </span><span class="c1"># prepare the data for training and testing</span>
<span class="lineno">13 </span>dTrain <span class="o">=</span> xgb.DMatrix<span class="p">(</span>X_train<span class="p">,</span> label <span class="o">=</span> y_train<span class="p">)</span>
<span class="lineno">14 </span>dTest <span class="o">=</span> xgb.DMatrix<span class="p">(</span>X_test<span class="p">)</span>
<span class="lineno">15 </span>params <span class="o">=</span> <span class="kt">list</span><span class="p">(</span>
<span class="lineno">16 </span> <span class="s">"booster"</span> <span class="o">=</span> <span class="s">"gbtree"</span><span class="p">,</span>
<span class="lineno">17 </span> <span class="s">"objective"</span> <span class="o">=</span> <span class="s">"reg:linear"</span><span class="p">,</span>
<span class="lineno">18 </span> <span class="s">"eta"</span> <span class="o">=</span> <span class="m">0.1</span><span class="p">,</span>
<span class="lineno">19 </span> <span class="s">"max_depth"</span> <span class="o">=</span> <span class="m">5</span><span class="p">,</span>
<span class="lineno">20 </span> <span class="s">"subsample"</span> <span class="o">=</span> <span class="m">0.6</span><span class="p">,</span>
<span class="lineno">21 </span> <span class="s">"colsample_bytree"</span> <span class="o">=</span> <span class="m">0.8</span><span class="p">,</span>
<span class="lineno">22 </span> <span class="s">"min_child_weight"</span> <span class="o">=</span> <span class="m">2</span>
<span class="lineno">23 </span><span class="p">)</span>
<span class="lineno">24 </span><span class="c1"># run the cross-validation</span>
<span class="lineno">25 </span>hist <span class="o">=</span> xgb.cv<span class="p">(</span>
<span class="lineno">26 </span> params <span class="o">=</span> params<span class="p">,</span>
<span class="lineno">27 </span> data <span class="o">=</span> dTrain<span class="p">,</span>
<span class="lineno">28 </span> nrounds <span class="o">=</span> <span class="m">500</span><span class="p">,</span>
<span class="lineno">29 </span> early_stopping_rounds <span class="o">=</span> <span class="m">50</span><span class="p">,</span>
<span class="lineno">30 </span> metrics <span class="o">=</span> <span class="s">'rmse'</span><span class="p">,</span>
<span class="lineno">31 </span> nfold <span class="o">=</span> <span class="m">5</span><span class="p">,</span>
<span class="lineno">32 </span> verbose <span class="o">=</span> <span class="kc">FALSE</span>
<span class="lineno">33 </span><span class="p">)</span>
<span class="lineno">34 </span><span class="c1"># since we have the best number of trees from cv, let's train the model with this number of trees</span>
<span class="lineno">35 </span>model <span class="o">=</span> xgb.train<span class="p">(</span>params<span class="p">,</span> nrounds <span class="o">=</span> hist<span class="o">$</span>best_iteration<span class="p">,</span> data <span class="o">=</span> dTrain<span class="p">)</span>
<span class="lineno">36 </span>pred <span class="o">=</span> predict<span class="p">(</span>model<span class="p">,</span> dTest<span class="p">)</span>
<span class="lineno">37 </span>
<span class="lineno">38 </span><span class="kp">cat</span><span class="p">(</span>
<span class="lineno">39 </span> <span class="s">"rmse on testing instances is"</span><span class="p">,</span>
<span class="lineno">40 </span> rmse<span class="p">(</span>y_test<span class="p">,</span> pred<span class="p">),</span>
<span class="lineno">41 </span> <span class="s">"with"</span><span class="p">,</span>
<span class="lineno">42 </span> hist<span class="o">$</span>best_iteration<span class="p">,</span>
<span class="lineno">43 </span> <span class="s">"trees"</span>
<span class="lineno">44 </span><span class="p">)</span></code></pre></figure></p>
<language>Python</language>
<p>chapter7/xgb.py<br />
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno"> 1 </span><span class="kn">import</span> <span class="nn">xgboost</span> <span class="k">as</span> <span class="nn">xgb</span>
<span class="lineno"> 2 </span><span class="kn">from</span> <span class="nn">sklearn</span> <span class="k">import</span> <span class="n">datasets</span>
<span class="lineno"> 3 </span><span class="kn">from</span> <span class="nn">sklearn.utils</span> <span class="k">import</span> <span class="n">shuffle</span>
<span class="lineno"> 4 </span><span class="kn">from</span> <span class="nn">sklearn.metrics</span> <span class="k">import</span> <span class="n">mean_squared_error</span>
<span class="lineno"> 5 </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="lineno"> 6 </span>
<span class="lineno"> 7 </span><span class="n">seed</span> <span class="o">=</span> <span class="mi">42</span>
<span class="lineno"> 8 </span><span class="n">boston</span> <span class="o">=</span> <span class="n">datasets</span><span class="o">.</span><span class="n">load_boston</span><span class="p">()</span>
<span class="lineno"> 9 </span><span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">shuffle</span><span class="p">(</span><span class="n">boston</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">boston</span><span class="o">.</span><span class="n">target</span><span class="p">,</span> <span class="n">random_state</span><span class="o">=</span><span class="n">seed</span><span class="p">)</span>
<span class="lineno">10 </span><span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="lineno">11 </span><span class="n">offset</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">X</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">*</span> <span class="mf">0.8</span><span class="p">)</span>
<span class="lineno">12 </span><span class="n">X_train</span><span class="p">,</span> <span class="n">y_train</span> <span class="o">=</span> <span class="n">X</span><span class="p">[:</span><span class="n">offset</span><span class="p">],</span> <span class="n">y</span><span class="p">[:</span><span class="n">offset</span><span class="p">]</span>
<span class="lineno">13 </span><span class="n">X_test</span><span class="p">,</span> <span class="n">y_test</span> <span class="o">=</span> <span class="n">X</span><span class="p">[</span><span class="n">offset</span><span class="p">:],</span> <span class="n">y</span><span class="p">[</span><span class="n">offset</span><span class="p">:]</span>
<span class="lineno">14 </span>
<span class="lineno">15 </span><span class="n">params</span> <span class="o">=</span> <span class="p">{</span><span class="s1">'booster'</span><span class="p">:</span> <span class="s1">'gbtree'</span><span class="p">,</span> <span class="s1">'objective'</span><span class="p">:</span> <span class="s1">'reg:linear'</span><span class="p">,</span> <span class="s1">'learning_rate'</span><span class="p">:</span> <span class="mf">0.1</span><span class="p">,</span>
<span class="lineno">16 </span> <span class="s1">'max_depth'</span><span class="p">:</span> <span class="mi">5</span><span class="p">,</span> <span class="s1">'subsample'</span><span class="p">:</span> <span class="mf">0.6</span><span class="p">,</span> <span class="s1">'colsample_bytree'</span><span class="p">:</span> <span class="mf">0.8</span><span class="p">,</span> <span class="s1">'min_child_weight'</span><span class="p">:</span> <span class="mi">2</span><span class="p">}</span>
<span class="lineno">17 </span>
<span class="lineno">18 </span><span class="c1"># prepare the data for training and testing</span>
<span class="lineno">19 </span><span class="n">dtrain</span> <span class="o">=</span> <span class="n">xgb</span><span class="o">.</span><span class="n">DMatrix</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">X_train</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="n">y_train</span><span class="p">,</span> <span class="n">missing</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
<span class="lineno">20 </span><span class="n">dtest</span> <span class="o">=</span> <span class="n">xgb</span><span class="o">.</span><span class="n">DMatrix</span><span class="p">(</span><span class="n">X_test</span><span class="p">)</span>
<span class="lineno">21 </span>
<span class="lineno">22 </span><span class="c1"># run 5-fold cross-validation with maximum 1000 trees, and try to minimize the metric rmse</span>
<span class="lineno">23 </span><span class="c1"># early stopping 50 trees</span>
<span class="lineno">24 </span><span class="n">hist</span> <span class="o">=</span> <span class="n">xgb</span><span class="o">.</span><span class="n">cv</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">dtrain</span><span class="o">=</span><span class="n">dtrain</span><span class="p">,</span> <span class="n">nfold</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span>
<span class="lineno">25 </span> <span class="n">metrics</span><span class="o">=</span><span class="p">[</span><span class="s1">'rmse'</span><span class="p">],</span> <span class="n">num_boost_round</span><span class="o">=</span><span class="mi">1000</span><span class="p">,</span> <span class="n">maximize</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">early_stopping_rounds</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>
<span class="lineno">26 </span>
<span class="lineno">27 </span><span class="c1"># find the best number of trees from the cross-validation history</span>
<span class="lineno">28 </span><span class="n">best_number_trees</span> <span class="o">=</span> <span class="n">hist</span><span class="p">[</span><span class="s1">'test-rmse-mean'</span><span class="p">]</span><span class="o">.</span><span class="n">idxmin</span><span class="p">()</span>
<span class="lineno">29 </span>
<span class="lineno">30 </span><span class="c1"># since we have the best number of trees from cv, let's train the model with this number of trees</span>
<span class="lineno">31 </span><span class="n">model</span> <span class="o">=</span> <span class="n">xgb</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">dtrain</span><span class="p">,</span> <span class="n">num_boost_round</span><span class="o">=</span><span class="n">best_number_trees</span><span class="p">)</span>
<span class="lineno">32 </span><span class="n">pred</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">dtest</span><span class="p">)</span>
<span class="lineno">33 </span><span class="nb">print</span><span class="p">(</span>
<span class="lineno">34 </span> <span class="n">f</span><span class="s2">"rmse on testing instances is {mean_squared_error(pred, y_test)**0.5:.6f} with </span><span class="si">{best_number_trees}</span><span class="s2"> trees"</span><span class="p">)</span></code></pre></figure></p>
<p>The parameters <code>subsample</code> and <code>colsample_bytree</code> control the stochasticity, and both take values in $(0, 1]$. If we set these two parameters to 1, then all instances and all features are used for fitting every tree.</p>
<p>The two code snippets illustrate a minimal workflow of fitting a <span class="caps">GBM</span> model. First, we conduct hyperparameter tuning (learning rate, number of trees, regularization parameters, stochasticity parameters, etc.) with cross-validation, and then we train the model with the tuned parameters.</p>
<p>Running the code snippets, we have the following results.</p>
<div class="codewrapper">
<div class="codeleft">
<language>R</language>
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno">1 </span><span class="o">></span> <span class="kn">source</span><span class="p">(</span><span class="s">'xgb.R'</span><span class="p">)</span>
<span class="lineno">2 </span>rmse on testing instances is <span class="m">2.632298</span> with <span class="m">83</span> trees</code></pre></figure></div>
<div class="coderight">
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"><span></span><span class="lineno">1 </span><span class="n">chapter7</span> <span class="err">$</span><span class="n">python3</span><span class="o">.</span><span class="mi">7</span> <span class="n">xgb</span><span class="o">.</span><span class="n">py</span>
<span class="lineno">2 </span><span class="n">rmse</span> <span class="n">on</span> <span class="n">testing</span> <span class="n">instances</span> <span class="ow">is</span> <span class="mf">2.736038</span> <span class="k">with</span> <span class="mi">179</span> <span class="n">trees</span></code></pre></figure></div>
</div>
<p>In <code>XGBoost</code>, we could also use linear regression models as the booster (or base learner) instead of decision trees. However, when <code>'booster':'gblinear'</code> is used, the sum of the predictions from all boosters in the model is equivalent to the prediction from a single (combined) linear model. In that sense, what we get is just a Lasso solution of a linear regression model.</p>
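<p>As a sketch (reusing the <code>dtrain</code> matrix from the snippet above; the parameter values are arbitrary), switching the booster looks like this, where <code>alpha</code> and <code>lambda</code> are the L1 and L2 regularization parameters of the linear booster:</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"># boosting with linear base learners; the L1 penalty alpha is what
# makes the combined model resemble a Lasso solution
params = {'booster': 'gblinear', 'objective': 'reg:linear',
          'alpha': 0.1, 'lambda': 0}
model = xgb.train(params, dtrain, num_boost_round=100)</code></pre></figure>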
<p><span class="caps">GBM</span> can be used in different tasks, such as classification, ranking, survival analysis, etc. When we use <span class="caps">GBM</span> for predictive modeling, missing value imputation is not required, which is one big advantage over linear models. But in our own implementation we don’t consider missing values for simplicity. In <span class="caps">GBM</span>, if a feature is categorical we could do label-encoding<sup class="footnote" id="fnr18"><a href="#fn18">18</a></sup>, i.e., mapping the feature to integers directly without creating dummy variables (such as one-hot encoding). Of course one-hot encoding<sup class="footnote" id="fnr19"><a href="#fn19">19</a></sup> can also be used. But when there are too many new columns created by one-hot encoding, the probability that the original categorical feature is selected is higher than these numerical variables. In other words, we are assigning a prior weight to the categorical feature regarding the feature-wise stochasticity.</p>
<p>For quite a few real-world prediction problems, monotonic constraints are desired. Monotonic constraints are either increasing or decreasing. The increasing constraint for feature $x_k$ refers to the relationship $f(x_1,…,x_k,…,x_m)\le f(x_1,…,x_k’,…,x_m)$ if $x_k\le x_k’$. For example, an increasing constraint on the number of bedrooms in a house price prediction model makes a lot of sense. Using gradient boosting tree regression models, we can enforce such monotonic constraints in a straightforward manner. Simply, after we get the best split for the current node, we check whether the split violates the monotonic constraint; the split is not adopted if the constraint is broken.</p>
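<p>In <code>XGBoost</code>, for example, such constraints can be declared via the <code>monotone_constraints</code> parameter, with one flag per feature (1 for increasing, -1 for decreasing, 0 for unconstrained). A sketch, assuming a <code>dtrain</code> with three features and an increasing constraint on the first one:</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3"># enforce an increasing constraint on the first of three features
params = {'objective': 'reg:linear', 'max_depth': 5,
          'monotone_constraints': '(1,0,0)'}
model = xgb.train(params, dtrain, num_boost_round=100)</code></pre></figure>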
<h2 id="ul">Unsupervised learning</h2>
<p>For many supervised learning tasks, we could formulate the problem as an optimization problem by writing down the loss function as a function of the training input and output. In unsupervised learning problems, there are no labels/outputs, and it is more difficult to formulate such problems in a unified approach. Still, some unsupervised learning problems can be formulated as optimization problems. Let’s see a few examples briefly.</p>
<h3>Principal component analysis (<span class="caps">PCA</span>)</h3>
<p><span class="caps">PCA</span> is a very popular technique in many Engineering disciplines. As you may heard of <span class="caps">PCA</span>, it is a technique for dimension reduction. More specific, let $\boldsymbol{x}$ denote a $n*m$ matrix, and each row of $\boldsymbol{x}$ denoted as $\boldsymbol{x_i}’$ represents a point in $\mathbb{R}^m$. Sometimes the dimension $m$ could be relatively large and we don’t like that. In order to represent the data in a more compact way, we want to transform the raw data points into a new coordinate system in $\mathbb{R}^p$ where $p<m$. The $k^{th};k=1,…,p$ coordinate of $\boldsymbol{x_i}$ in the new coordinate system can be written $\boldsymbol{w_k}’\boldsymbol{x_i}$. However, there are infinite $\boldsymbol{w_k}$ for the transformation and we have to make a guidance for such transformation. The key of our guidance is to make the data points projected onto the first transformed coordinate have the largest variance, and $\boldsymbol{w_1}’\boldsymbol{x_i}$ is called the first principal component. And we could find the remaining transformations and principal components iteratively.</p>
<p>So now we see how <span class="caps">PCA</span> is formulated as an optimization problem. However, under the above setting, there are infinitely many solutions for $\boldsymbol{w_k}$, since scaling $\boldsymbol{w_k}$ scales the variance as well. We therefore usually add a constraint on $\boldsymbol{w_k}$ in <span class="caps">PCA</span>, i.e., $|\boldsymbol{w_k}|=1$. The solution to this constrained optimization problem is surprisingly elegant – the optimal $\boldsymbol{w_k}$ are the eigenvectors of the covariance matrix of $\boldsymbol{x}$. Now let’s try to conduct a <span class="caps">PCA</span> with eigen decomposition (it can also be done with other decompositions).</p>
<language>R</language>
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno"> 1 </span><span class="o">></span> <span class="kp">set.seed</span><span class="p">(</span><span class="m">42</span><span class="p">)</span>
<span class="lineno"> 2 </span><span class="o">></span> n <span class="o">=</span> <span class="m">1000</span>
<span class="lineno"> 3 </span><span class="o">></span> <span class="c1"># we simulate some data points on a 2d plane</span>
<span class="lineno"> 4 </span><span class="o">></span> x1 <span class="o">=</span> rexp<span class="p">(</span>n<span class="p">)</span>
<span class="lineno"> 5 </span><span class="o">></span> x2 <span class="o">=</span> x1<span class="o">*</span><span class="m">3</span> <span class="o">+</span> rnorm<span class="p">(</span>n<span class="p">,</span> <span class="m">12</span><span class="p">,</span> <span class="m">2</span><span class="p">)</span>
<span class="lineno"> 6 </span><span class="o">></span> x <span class="o">=</span> <span class="kp">cbind</span><span class="p">(</span>x1<span class="p">,</span> x2<span class="p">)</span>
<span class="lineno"> 7 </span><span class="o">></span> <span class="c1"># total marginal variance</span>
<span class="lineno"> 8 </span><span class="o">></span> <span class="kp">sum</span><span class="p">(</span><span class="kp">diag</span><span class="p">(</span>cov<span class="p">(</span>x<span class="p">)))</span>
<span class="lineno"> 9 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">15.54203</span>
<span class="lineno">10 </span><span class="o">></span> pca_vectors <span class="o">=</span> <span class="kp">eigen</span><span class="p">(</span>cov<span class="p">(</span>x<span class="p">))</span><span class="o">$</span>vectors
<span class="lineno">11 </span><span class="o">></span> <span class="c1"># find the projection on the new coordinate system</span>
<span class="lineno">12 </span><span class="o">></span> z <span class="o">=</span> x <span class="o">%*%</span> pca_vectors
<span class="lineno">13 </span><span class="o">></span> <span class="c1"># total marginal variance after transformation</span>
<span class="lineno">14 </span><span class="o">></span> <span class="kp">sum</span><span class="p">(</span><span class="kp">diag</span><span class="p">(</span>cov<span class="p">(</span>z<span class="p">)))</span>
<span class="lineno">15 </span><span class="p">[</span><span class="m">1</span><span class="p">]</span> <span class="m">15.54203</span></code></pre></figure><figure class="text-center">
<img src="figures/pca.png" alt="Original vs transformed data points via PCA" style="display: block;margin: auto;" width="75%">
<figcaption class="centerfigcaption">Original vs transformed data points via <span class="caps">PCA</span></figcaption>
</figure>
<p>In fact, it does not matter whether the raw data points are centered or not if the eigen decomposition is performed on the covariance matrix. If you prefer to decompose $\boldsymbol{x’}\boldsymbol{x}$ directly, centering is a necessary step. In practice, we rarely need to perform <span class="caps">PCA</span> in this manual way, since ready-to-use functions/packages are available in both R and Python.</p>
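<p>For instance, in Python the same analysis could be done with <code>scikit-learn</code>; below is a sketch on data simulated the same way as in the R snippet above.</p>
<language>Python</language>
<figure class="highlight"><pre><code class="language-python3" data-lang="python3">import numpy as np
from sklearn.decomposition import PCA

np.random.seed(42)
n = 1000
x1 = np.random.exponential(size=n)
x2 = x1 * 3 + np.random.normal(12, 2, size=n)
x = np.column_stack([x1, x2])

pca = PCA(n_components=2)
z = pca.fit_transform(x)  # scikit-learn centers the data internally
# the rotation preserves the total marginal variance
print(x.var(axis=0, ddof=1).sum(), z.var(axis=0, ddof=1).sum())</code></pre></figure>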
<h3>Mixture model</h3>
<p>In previous chapters we have seen how to fit a distribution to data. But what if the data points actually come from multiple distributions, for example, a mixture of two Gaussian distributions? If we knew which distribution each observed data point comes from, it would not be difficult to estimate the parameters. But in some situations it is impossible to tell which distribution a point is sampled from. Usually, a mixture of multiple distributions can be estimated by the maximum likelihood method: we derive the likelihood function of the observed data points and then we have an optimization problem.</p>
<p>Suppose we have a random sample of size $n$, and each observation $x_i;i=1,…,n$ comes from one of $K$ multivariate Gaussian distributions. The $k^{th}$ distribution is denoted as $\mathcal{N}_k(\mu_k,\Sigma_k)$. We want to estimate the parameters of these Gaussian distributions.</p>
<p>As we discussed in Chapter 4, there are two commonly used approaches for distribution fitting, i.e., the method of moments and maximum likelihood estimation. In this case, we use maximum likelihood estimation because the likelihood function can be easily derived, as below.</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{P} = \prod_{i=1}^{n}{\sum_{k=1}^{K}{\pi_k}f(x_i|\mu_k,\Sigma_k) },<br />
\label{eq:gmm1}<br />
\end{equation}<br />
$$</p>
<p>where $\pi_k$ represents the probability that a randomly selected data point belongs to distribution $k$.<br />
And thus, the log-likelihood function becomes:</p>
<p>$$<br />
\begin{equation}<br />
\mathcal{L} = \sum_{i=1}^{n}\log\big(\sum_{k=1}^{K}{\pi_k}f(x_i|\mu_k,\Sigma_k)\big).<br />
\label{eq:gmm2}<br />
\end{equation}<br />
$$</p>
<p>Let’s try to implement the above idea in R with the <code>optim</code> function.</p>
<language>R</language>
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="lineno"> 1 </span><span class="o">></span> <span class="kn">library</span><span class="p">(</span>mvtnorm<span class="p">)</span>
<span class="lineno"> 2 </span><span class="o">></span> <span class="kp">set.seed</span><span class="p">(</span><span class="m">42</span><span class="p">)</span>
<span class="lineno"> 3 </span><span class="o">></span> n <span class="o">=</span> <span class="m">1000</span>
<span class="lineno"> 4 </span><span class="o">></span> <span class="c1"># 60% samples are from distribution 1 and the remainings are from distribution 2</span>
<span class="lineno"> 5 </span><span class="o">></span> p <span class="o">=</span> <span class="m">0.6</span>
<span class="lineno"> 6 </span><span class="o">></span> n1 <span class="o">=</span> n <span class="o">*</span> p
<span class="lineno"> 7 </span><span class="o">></span> n2 <span class="o">=</span> n <span class="o">-</span> n1
<span class="lineno"> 8 </span><span class="o">></span> mu1 <span class="o">=</span> <span class="kt">c</span><span class="p">(</span><span class="m">-1</span><span class="p">,</span> <span class="m">-1</span><span class="p">)</span>
<span class="lineno"> 9 </span><span class="o">></span> mu2 <span class="o">=</span> <span class="kt">c</span><span class="p">(</span><span class="m">1</span><span class="p">,</span> <span class="m">1</span><span class="p">)</span>
<span class="lineno">10 </span><span class="o">></span> sigma1 <span class="o">=</span> <span class="kt">array</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="m">1</span><span class="p">,</span> <span class="m">0.5</span><span class="p">,</span> <span class="m">0.5</span><span class="p">,</span> <span class="m">1</span><span class="p">),</span> <span class="kt">c</span><span class="p">(</span><span class="m">2</span><span class="p">,</span> <span class="m">2</span><span class="p">))</span>
<span class="lineno">11 </span><span class="o">></span> sigma2 <span class="o">=</span> <span class="kt">array</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="m">1</span><span class="p">,</span> <span class="m">-0.2</span><span class="p">,</span> <span class="m">-0.2</span><span class="p">,</span> <span class="m">1</span><span class="p">),</span> <span class="kt">c</span><span class="p">(</span><span class="m">2</span><span class="p">,</span> <span class="m">2</span><span class="p">))</span>
<span class="lineno">12 </span><span class="o">></span> x1 <span class="o">=</span> rmvnorm<span class="p">(</span>n <span class="o">=</span> n1<span class="p">,</span> mean <span class="o">=</span> mu1<span class="p">,</span> sigma <span class="o">=</span> sigma1<span class="p">)</span>
<span class="lineno">13 </span><span class="o">></span> x2 <span class="o">=</span> rmvnorm<span class="p">(</span>n <span class="o">=</span> n2<span class="p">,</span> mean <span class="o">=</span> mu2<span class="p">,</span> sigma <span class="o">=</span> sigma2<span class="p">)</span>
<span class="lineno">14 </span><span class="o">></span> x <span class="o">=</span> <span class="kp">rbind</span><span class="p">(</span>x1<span class="p">,</span> x2<span class="p">)</span>
<span class="lineno">15 </span><span class="o">></span> <span class="c1"># let's permute x</span>
<span class="lineno">16 </span><span class="o">></span> x_permuted <span class="o">=</span> x<span class="p">[</span><span class="kp">sample</span><span class="p">(</span>n<span class="p">),</span> <span class="p">]</span>
<span class="lineno">17 </span><span class="o">></span>
<span class="lineno">18 </span><span class="o">></span>
<span class="lineno">19 </span><span class="o">></span> log_lik <span class="o">=</span> <span class="kr">function</span><span class="p">(</span>theta<span class="p">,</span> x<span class="p">)</span> <span class="p">{</span>
<span class="lineno">20 </span><span class="o">+</span> <span class="c1"># theta is a vector of length 9</span>
<span class="lineno">21 </span><span class="o">+</span> <span class="c1"># first let's reparametrize the parameters</span>
<span class="lineno">22 </span><span class="o">+</span> mu1 <span class="o">=</span> theta<span class="p">[</span><span class="m">1</span><span class="o">:</span><span class="m">2</span><span class="p">]</span>
<span class="lineno">23 </span><span class="o">+</span> mu2 <span class="o">=</span> theta<span class="p">[</span><span class="m">3</span><span class="o">:</span><span class="m">4</span><span class="p">]</span>
<span class="lineno">24 </span><span class="o">+</span> sigma1 <span class="o">=</span> <span class="kt">array</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span>theta<span class="p">[</span><span class="m">5</span><span class="p">],</span> theta<span class="p">[</span><span class="m">6</span><span class="p">],</span> theta<span class="p">[</span><span class="m">6</span><span class="p">],</span> theta<span class="p">[</span><span class="m">5</span><span class="p">]),</span> <span class="kt">c</span><span class="p">(</span><span class="m">2</span><span class="p">,</span> <span class="m">2</span><span class="p">))</span>
<span class="lineno">25 </span><span class="o">+</span> sigma2 <span class="o">=</span> <span class="kt">array</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span>theta<span class="p">[</span><span class="m">7</span><span class="p">],</span> theta<span class="p">[</span><span class="m">8</span><span class="p">],</span> theta<span class="p">[</span><span class="m">8</span><span class="p">],</span> theta<span class="p">[</span><span class="m">7</span><span class="p">]),</span> <span class="kt">c</span><span class="p">(</span><span class="m">2</span><span class="p">,</span> <span class="m">2</span><span class="p">))</span>
<span class="lineno">26 </span><span class="o">+</span> pi_1 <span class="o">=</span> theta<span class="p">[</span><span class="m">9</span><span class="p">]</span>
<span class="lineno">27 </span><span class="o">+</span> pi_2 <span class="o">=</span> <span class="m">1</span> <span class="o">-</span> pi_1
<span class="lineno">28 </span><span class="o">+</span> <span class="c1"># we return the negative log-likelihood</span>
<span class="lineno">29 </span><span class="o">+</span> <span class="o">-</span> <span class="kp">sum</span><span class="p">(</span><span class="kp">log</span><span class="p">(</span>pi_1 <span class="o">*</span> dmvnorm<span class="p">(</span>x<span class="p">,</span> mu1<span class="p">,</span> sigma1<span class="p">)</span> <span class="o">+</span> pi_2 <span class="o">*</span> dmvnorm<span class="p">(</span>x<span class="p">,</span> mu2<span class="p">,</span> sigma2<span class="p">)))</span>
<span class="lineno">30 </span><span class="o">+</span> <span class="p">}</span>
<span class="lineno">31 </span><span class="o">></span>