
Commit

Make more accurate the way in which we count parameters. The previous count incorrectly included the positional encoding params, when typically only the number of weight parameters is reported for these models.
karpathy committed Feb 4, 2023
1 parent 3341b4c commit 34720df
Showing 1 changed file with 12 additions and 1 deletion.
model.py: 13 changes (12 additions & 1 deletion)
@@ -152,8 +152,19 @@ def __init__(self, config):
                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
 
         # report number of parameters
+        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
+
+    def get_num_params(self, non_embedding=True):
+        """
+        Return the number of parameters in the model.
+        For non-embedding count (default), the position embeddings get subtracted.
+        The token embeddings would too, except due to the parameter sharing these
+        params are actually used as weights in the final layer, so we include them.
+        """
         n_params = sum(p.numel() for p in self.parameters())
-        print("number of parameters: %.2fM" % (n_params/1e6,))
+        if non_embedding:
+            n_params -= self.transformer.wpe.weight.numel()
+        return n_params
 
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
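For a sense of scale, here is a rough, hand-rolled tally of what the change does to the reported number, assuming the standard GPT-2 small shape (vocab_size=50257, block_size=1024, n_layer=12, n_embd=768, with biases); this sketch is illustrative and not part of the commit. The wpe term is exactly what the new non_embedding=True default subtracts, while wte stays in the count because, as the docstring notes, it is tied to the lm_head weight.

# Back-of-the-envelope parameter count for a GPT-2-small-shaped model
# (assumed shape, not taken from the commit).
vocab_size, block_size, n_embd, n_layer = 50257, 1024, 768, 12

wte = vocab_size * n_embd                 # token embeddings, tied to lm_head, so kept in the count
wpe = block_size * n_embd                 # position embeddings, subtracted when non_embedding=True
per_block = 12 * n_embd**2 + 13 * n_embd  # attention + MLP weights, plus their biases and layernorms
ln_f = 2 * n_embd                         # final layernorm

total = wte + wpe + n_layer * per_block + ln_f
print("with position embeddings:    %.2fM" % (total / 1e6))          # ~124.44M (the old report)
print("without position embeddings: %.2fM" % ((total - wpe) / 1e6))  # ~123.65M (the new default)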
