|
1 | 1 | { |
2 | 2 | "metadata": { |
3 | 3 | "name": "", |
4 | | - "signature": "sha256:0d06f7c0dcbe740f33bc07e6b2f3b75bd99464160080eeb0dc89c4b381ba494a" |
| 4 | + "signature": "sha256:945f626408fb0a4ebaefe616111b44b5fbd70d5fef9361265f942a7392cac81e" |
5 | 5 | }, |
6 | 6 | "nbformat": 3, |
7 | 7 | "nbformat_minor": 0, |
|
70 | 70 | " <td> 0</td>\n", |
71 | 71 | " </tr>\n", |
72 | 72 | " <tr>\n", |
73 | | - " <th>ham</th>\n", |
| 73 | + " <th>spam</th>\n", |
74 | 74 | " <td> 0</td>\n", |
75 | 75 | " <td> 1</td>\n", |
76 | 76 | " <td> 1</td>\n", |
77 | 77 | " <td> 0</td>\n", |
78 | 78 | " <td> 0</td>\n", |
79 | 79 | " </tr>\n", |
80 | 80 | " <tr>\n", |
81 | | - " <th>ham</th>\n", |
| 81 | + " <th>spam</th>\n", |
82 | 82 | " <td> 0</td>\n", |
83 | 83 | " <td> 0</td>\n", |
84 | 84 | " <td> 0</td>\n", |
|
332 | 332 | " </tbody>\n", |
333 | 333 | "</table>\n", |
334 | 334 | "\n", |
335 | | - "What is the probability of $P(y=\\textrm{Walt Whitman}|x = [7, 4, 0, 0, 0, 12, 6, 8, 3, 0])$? And what is the probability of $P(y=\\textrm{J.K. Rowling}|x = [7, 4, 0, 0, 0, 12, 6, 8, 3, 0])$?" |
| 335 | + "What is the probability of $P(y=\\textrm{Walt Whitman}|x = [12, 10, 1, 8, 0, 4, 0, 0, 0, 4])$? And what is the probability of $P(y=\\textrm{J.K. Rowling}|x = [12, 10, 1, 8, 0, 4, 0, 0, 0, 4])$?" |
336 | 336 | ] |
337 | 337 | }, |
338 | 338 | { |
|
473 | 473 | "cell_type": "markdown", |
474 | 474 | "metadata": {}, |
475 | 475 | "source": [ |
476 | | - "**d)** Now that we know how to compute the posterior probability, it is time to implement our naive bayes learner. We will start with implementing the `fit` method. The `fit` method has two core jobs:\n", |
| 476 | + "**d)** Now that we know how to compute the posterior probability, it is time to implement our naive bayes learner. We will start with implementing the `fit` method. The `fit` method has three core jobs:\n", |
477 | 477 | "\n", |
478 | 478 | "1. extract all the counts of each feature given each class;\n", |
479 | | - "2. count how often each class label occurs in the training data.\n", |
| 479 | + "2. count how often each class label occurs in the training data;\n", |
| 480 | + "3. count the number of unique features.\n", |
480 | 481 | "\n", |
481 | 482 | "The `NaiveBayesLearner` below provides the skeleton of our class. Implement the `fit` method." |
482 | 483 | ] |
|
502 | 503 | " self.N = defaultdict(Counter) # insert your code here (feature counts per class)\n", |
503 | 504 | " for x, y_x in zip(X, y):\n", |
504 | 505 | " self.N[y_x] += Counter(x)\n", |
| 506 | + " self.V = len(set(x for y_x in self.N for x in self.N[y_x])) # number of unique features\n", |
505 | 507 | " \n", |
506 | 508 | " def predict(self, x):\n", |
507 | 509 | " \"\"\"Predict the outcome for example x. Choose the most\n", |
|
572 | 574 | " self.N = defaultdict(Counter)\n", |
573 | 575 | " for x, y_x in zip(X, y):\n", |
574 | 576 | " self.N[y_x] += Counter(x)\n", |
| 577 | + " self.V = len(set(x for y_x in self.N for x in self.N[y_x]))\n", |
575 | 578 | "\n", |
576 | 579 | " def prior(self, y):\n", |
577 | 580 | " \"\"\"Return the prior probability of class y.\"\"\"\n", |
|
583 | 586 | " \"\"\"Apply Laplace Smoothing to give a probability\n", |
584 | 587 | " estimate of feature x given y.\"\"\"\n", |
585 | 588 | " # insert your code here\n", |
586 | | - " return (self.N[y][x] + 1.0) / (sum(self.N[y].values()) + len(self.N))\n", |
| 589 | + " return (self.N[y][x] + 1.0) / (sum(self.N[y].values()) + self.V)\n", |
587 | 590 | "\n", |
588 | 591 | "# these tests should return True if your code is correct\n", |
589 | 592 | "nb = NaiveBayesLearner()\n", |
|
706 | 709 | " self.N = defaultdict(Counter)\n", |
707 | 710 | " for x, y_x in zip(X, y):\n", |
708 | 711 | " self.N[y_x] += Counter(x)\n", |
| 712 | + " self.V = len(set(x for y_x in self.N for x in self.N[y_x]))\n", |
709 | 713 | "\n", |
710 | 714 | " def prior(self, y):\n", |
711 | 715 | " \"\"\"Return the prior probability of class y.\"\"\"\n", |
|
714 | 718 | " def probability(self, x, y):\n", |
715 | 719 | " \"\"\"Apply Laplace Smoothing to give a probability\n", |
716 | 720 | " estimate of feature x given y.\"\"\"\n", |
717 | | - " return (self.N[y][x] + 1.0) / (sum(self.N[y].values()) + len(self.N))\n", |
| 721 | + " return (self.N[y][x] + 1.0) / (sum(self.N[y].values()) + self.V)\n", |
718 | 722 | "\n", |
719 | 723 | " def predict(self, x):\n", |
720 | 724 | " \"\"\"Predict the outcome for example x. Choose the most\n", |
|
1659 | 1663 | "input": [ |
1660 | 1664 | "from IPython.core.display import HTML\n", |
1661 | 1665 | "def css_styling():\n", |
1662 | | - " styles = open(\"styles/custom.css\", \"r\").read()\n", |
| 1666 | + " styles = open(\"../styles/custom.css\", \"r\").read()\n", |
1663 | 1667 | " return HTML(styles)\n", |
1664 | 1668 | "css_styling()" |
1665 | 1669 | ], |
|
1723 | 1727 | ], |
1724 | 1728 | "metadata": {}, |
1725 | 1729 | "output_type": "pyout", |
1726 | | - "prompt_number": 370, |
| 1730 | + "prompt_number": 10, |
1727 | 1731 | "text": [ |
1728 | | - "<IPython.core.display.HTML at 0x117f224e0>" |
| 1732 | + "<IPython.core.display.HTML at 0x103fc0ac8>" |
1729 | 1733 | ] |
1730 | 1734 | } |
1731 | 1735 | ], |
1732 | | - "prompt_number": 370 |
| 1736 | + "prompt_number": 10 |
1733 | 1737 | }, |
1734 | 1738 | { |
1735 | 1739 | "cell_type": "markdown", |
|
0 commit comments