Commit 7638cb3

Add ::tabs-start markers to ML articles for interactive code blocks (#5654)
All 27 ML problem articles now have ::tabs-start/::tabs-end markers around Python code blocks, enabling the copy button and syntax highlighting in the Solution tab on neetcode.io.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

1 parent bcac58b commit 7638cb3

27 files changed: 243 additions & 0 deletions
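
Every diff below applies the same mechanical change: wrap each article's fenced Python block in the tab markers. Here is a minimal sketch of the resulting markdown (illustrative only, not a verbatim excerpt from any one file):

````markdown
### Implementation

::tabs-start
```python
# solution code for the article goes here
```
::tabs-end
````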

articles/backpropagation.md

Lines changed: 9 additions & 0 deletions
@@ -37,6 +37,7 @@ We run the forward pass to get $\hat{y}$, compute the delta term (error times si

### Implementation

+::tabs-start
```python
import numpy as np
from numpy.typing import NDArray
@@ -57,6 +58,8 @@ class Solution:

        return (dL_dw, dL_db)
```
+::tabs-end
+

### Walkthrough

@@ -87,18 +90,22 @@ The negative gradients mean: increase $w_0$, increase $w_1$, and increase $b$ to

The error is $\hat{y} - y$, not $y - \hat{y}$. Flipping it negates all gradients, making the model move away from the target.

+::tabs-start
```python
# Wrong: inverted error
error = y_true - y_hat

# Correct: prediction minus truth
error = y_hat - y_true
```
+::tabs-end
+

### Forgetting the Sigmoid Derivative

The sigmoid derivative is part of the chain. Without it, you are computing the gradient as if the activation were linear, which gives wrong weight updates.

+::tabs-start
```python
# Wrong: missing sigmoid derivative in the chain
delta = error # only the error, no activation derivative
@@ -107,6 +114,8 @@ delta = error # only the error, no activation derivative
sigmoid_deriv = y_hat * (1.0 - y_hat)
delta = error * sigmoid_deriv
```
+::tabs-end
+

---

articles/basics-of-pytorch.md

Lines changed: 9 additions & 0 deletions
@@ -29,6 +29,7 @@ Each method exercises a core PyTorch operation. We use `torch.reshape` for resha

### Implementation

+::tabs-start
```python
import torch
import torch.nn
@@ -53,6 +54,8 @@ class Solution:
        loss = torch.nn.functional.mse_loss(prediction, target)
        return torch.round(loss, decimals=4)
```
+::tabs-end
+

### Walkthrough

@@ -76,25 +79,31 @@ class Solution:

`dim=0` averages across rows (column-wise means), `dim=1` averages across columns (row-wise means). These are easy to confuse.

+::tabs-start
```python
# Wrong: averages across columns instead of rows
averaged = torch.mean(to_avg, dim=1)

# Correct: averages across rows (column means)
averaged = torch.mean(to_avg, dim=0)
```
+::tabs-end
+

### Mismatched Shapes for Concatenation

Concatenation along `dim=1` requires the same number of rows. Different row counts cause a runtime error.

+::tabs-start
```python
# Wrong: different number of rows (2 vs 3)
torch.cat((torch.zeros(2, 3), torch.zeros(3, 3)), dim=1)

# Correct: same number of rows
torch.cat((torch.zeros(2, 3), torch.zeros(2, 3)), dim=1)
```
+::tabs-end
+

---

articles/build-vocabulary.md

Lines changed: 9 additions & 0 deletions
@@ -32,6 +32,7 @@ Extract unique characters with `set()`, sort them, build two dictionaries with e

### Implementation

+::tabs-start
```python
from typing import Dict, List, Tuple

@@ -48,6 +49,8 @@ class Solution:
    def decode(self, ids: List[int], itos: Dict[int, str]) -> str:
        return ''.join(itos[i] for i in ids)
```
+::tabs-end
+

### Walkthrough

@@ -77,25 +80,31 @@ Round-trip: `decode(encode("hello")) = "hello"`.

Python sets have no guaranteed iteration order. Without sorting, the same text may produce different vocabularies on different runs.

+::tabs-start
```python
# Wrong: non-deterministic order
chars = list(set(text))

# Correct: sorted for reproducibility
chars = sorted(set(text))
```
+::tabs-end
+

### Building itos Incorrectly

The `itos` mapping must be the exact inverse of `stoi`. Building it independently can introduce mismatches.

+::tabs-start
```python
# Wrong: building independently, might not be exact inverse
itos = {i: ch for i, ch in enumerate(chars)}

# Correct: derive from stoi to guarantee inverse relationship
itos = {i: ch for ch, i in stoi.items()}
```
+::tabs-end
+

---

articles/code-gpt.md

Lines changed: 9 additions & 0 deletions
@@ -33,6 +33,7 @@ Compose all previously built components: embedding layers, a sequence of transfo

### Implementation

+::tabs-start
```python
import torch
import torch.nn as nn
@@ -135,6 +136,8 @@ class GPT(nn.Module):
        embedded = embedded + self.linear_network(self.second_norm(embedded)) # another skip connection
        return embedded
```
+::tabs-end
+

### Walkthrough

@@ -166,6 +169,7 @@ Each of the 5 positions outputs a distribution over 100 tokens, predicting the n

Without position embeddings, the model has no way to distinguish "cat sat" from "sat cat." The representations would be identical.

+::tabs-start
```python
# Wrong: no position information
embedded = self.word_embeddings(context)
@@ -177,11 +181,14 @@ positions = torch.arange(context.shape[1], device=context.device)
embedded = embedded + self.position_embeddings(positions)
output = self.transformer_blocks(embedded)
```
+::tabs-end
+

### Using nn.ModuleList Instead of nn.Sequential for Blocks

`nn.Sequential` chains modules automatically in `forward`. `nn.ModuleList` requires you to write the loop yourself. Both register parameters, but Sequential is cleaner here.

+::tabs-start
```python
# Works but requires manual loop
self.blocks = nn.ModuleList([TransformerBlock(...) for _ in range(N)])
@@ -191,6 +198,8 @@ self.blocks = nn.ModuleList([TransformerBlock(...) for _ in range(N)])
self.blocks = nn.Sequential(*[TransformerBlock(...) for _ in range(N)])
# forward: x = self.blocks(x)
```
+::tabs-end
+

---

articles/cross-entropy-loss.md

Lines changed: 9 additions & 0 deletions
@@ -36,6 +36,7 @@ For binary cross-entropy, we apply the formula directly: clip predictions with e

### Implementation

+::tabs-start
```python
import numpy as np
from numpy.typing import NDArray
@@ -55,6 +56,8 @@ class Solution:
        loss = -np.mean(np.sum(y_true * np.log(y_pred), axis=1))
        return round(loss, 4)
```
+::tabs-end
+

### Walkthrough

@@ -90,6 +93,7 @@ Average: $(0.35667 + 0.22314) / 2 = 0.28991$

Without epsilon clipping, $\log(0)$ produces $-\infty$ and breaks training.

+::tabs-start
```python
# Wrong: log(0) is undefined
loss = -np.mean(y_true * np.log(y_pred))
@@ -98,18 +102,23 @@ loss = -np.mean(y_true * np.log(y_pred))
y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
loss = -np.mean(y_true * np.log(y_pred))
```
+::tabs-end
+

### Mixing Up Binary and Categorical

Binary cross-entropy expects 1D arrays (one probability per sample). Categorical expects 2D arrays (one probability per class per sample). Using the wrong one silently produces wrong gradients.

+::tabs-start
```python
# Wrong: using BCE formula on one-hot encoded multi-class data
loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Correct: for multi-class, sum over classes first, then average over samples
loss = -np.mean(np.sum(y_true * np.log(y_pred), axis=1))
```
+::tabs-end
+

---

articles/gpt-data-loader.md

Lines changed: 9 additions & 0 deletions
@@ -32,6 +32,7 @@ Sample random starting indices with `torch.randint`, then for each index use ten

### Implementation

+::tabs-start
```python
import torch
from torchtyping import TensorType
@@ -45,6 +46,8 @@ class Solution:
        y = torch.stack([data[i + 1:i + 1 + context_length] for i in ix])
        return x, y
```
+::tabs-end
+

### Walkthrough

@@ -73,25 +76,31 @@ At position 0 of batch 0, the model sees $[20]$ and must predict $30$. At positi

If you sample from $[0, \text{len}(\text{data}))$ instead of $[0, \text{len}(\text{data}) - C)$, starting positions near the end will cause index-out-of-bounds when extracting the target window.

+::tabs-start
```python
# Wrong: index can be too large, y slice goes past end
ix = torch.randint(len(data), (batch_size,))

# Correct: ensure room for context_length + 1 tokens
ix = torch.randint(len(data) - context_length, (batch_size,))
```
+::tabs-end
+

### Forgetting the +1 Offset for Targets

The target window starts one position after the input window. Without the offset, input and target are identical and the model learns nothing.

+::tabs-start
```python
# Wrong: target same as input
y = torch.stack([data[i:i + context_length] for i in ix])

# Correct: target shifted by 1
y = torch.stack([data[i + 1:i + 1 + context_length] for i in ix])
```
+::tabs-end
+

---

articles/gpt-dataset.md

Lines changed: 9 additions & 0 deletions
@@ -33,6 +33,7 @@ Split the raw text into words, sample random starting positions, and extract con

### Implementation

+::tabs-start
```python
import torch
from typing import List, Tuple
@@ -49,6 +50,8 @@ class Solution:
            Y.append(tokenized[idx+1:idx+1+context_length])
        return X, Y
```
+::tabs-end
+

### Walkthrough

@@ -77,6 +80,7 @@ Each target word is the next word after the corresponding input position.

The problem uses `torch.manual_seed(0)` for reproducibility. Using `random.randint` instead produces different indices and fails the test cases.

+::tabs-start
```python
# Wrong: different RNG, non-reproducible
import random
@@ -87,11 +91,14 @@ indices = [random.randint(0, len(tokenized) - context_length - 1) for _ in range
torch.manual_seed(0)
indices = torch.randint(low=0, high=len(tokenized) - context_length, size=(batch_size,)).tolist()
```
+::tabs-end
+

### Forgetting to Convert Tensor Indices to Python List

`torch.randint` returns a tensor. Using it directly for list slicing works, but `.tolist()` makes the code clearer and avoids potential type issues.

+::tabs-start
```python
# Works but less clear
indices = torch.randint(low=0, high=n, size=(batch_size,))
@@ -101,6 +108,8 @@ for idx in indices:
# Better: explicit conversion
indices = torch.randint(low=0, high=n, size=(batch_size,)).tolist()
```
+::tabs-end
+

---

articles/gradient-descent.md

Lines changed: 6 additions & 0 deletions
@@ -29,6 +29,7 @@ We start at some initial value and repeatedly apply the update rule. Each iterat

### Implementation

+::tabs-start
```python
class Solution:
    def get_minimizer(self, iterations: int, learning_rate: float, init: int) -> float:
@@ -40,6 +41,8 @@ class Solution:

        return round(minimizer, 5)
```
+::tabs-end
+

### Walkthrough

@@ -66,6 +69,7 @@ Each step multiplies $x$ by $(1 - 2\alpha) = 0.8$, so convergence is geometric.

A common mistake is computing the derivative but not actually subtracting it from the current value:

+::tabs-start
```python
# Wrong: derivative computed but minimizer never changes
derivative = 2 * minimizer
@@ -75,6 +79,8 @@ derivative = 2 * minimizer
derivative = 2 * minimizer
minimizer = minimizer - learning_rate * derivative
```
+::tabs-end
+

### Using the Wrong Derivative

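As a sanity check on a sweep like this, a small script can confirm the markers are balanced in every article. This is a hypothetical helper, not part of the commit; it assumes the `articles/*.md` layout shown above:

```python
import pathlib

# Hypothetical check (not part of this commit): every ::tabs-start
# in an article must be matched by a ::tabs-end.
for path in sorted(pathlib.Path("articles").glob("*.md")):
    text = path.read_text()
    starts = text.count("::tabs-start")
    ends = text.count("::tabs-end")
    assert starts == ends, f"{path}: {starts} starts vs {ends} ends"
    print(f"{path.name}: {starts} tabbed block(s)")
```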