model | d_model | d_mlp | d_head | d_vocab | act_fn | n_heads | n_layers | n_ctx | n_params | positional_embedding_type |
---|---|---|---|---|---|---|---|---|---|---|
gpt2-small | 768 | 3072 | 64 | 50257 | gelu | 12 | 12 | 1024 | 85M | standard |
gpt2-medium | 1024 | 4096 | 64 | 50257 | gelu | 16 | 24 | 1024 | 302M | standard |
gpt2-large | 1280 | 5120 | 64 | 50257 | gelu | 20 | 36 | 1024 | 708M | standard |
gpt2-xl | 1600 | 6400 | 64 | 50257 | gelu | 25 | 48 | 1024 | 1.5B | standard |
distillgpt2 | 768 | 3072 | 64 | 50257 | gelu | 12 | 6 | 1024 | 42M | standard |
opt-125m | 768 | 3072 | 64 | 50272 | relu | 12 | 12 | 2048 | 85M | standard |
opt-1.3b | 2048 | 8192 | 64 | 50272 | relu | 32 | 24 | 2048 | 1.2B | standard |
opt-2.7b | 2560 | 10240 | 80 | 50272 | relu | 32 | 32 | 2048 | 2.5B | standard |
opt-6.7b | 4096 | 16384 | 128 | 50272 | relu | 32 | 32 | 2048 | 6.4B | standard |
opt-13b | 5120 | 20480 | 128 | 50272 | relu | 40 | 40 | 2048 | 13B | standard |
opt-30b | 7168 | 28672 | 128 | 50272 | relu | 56 | 48 | 2048 | 30B | standard |
opt-66b | 9216 | 36864 | 128 | 50272 | relu | 72 | 64 | 2048 | 65B | standard |
gpt-neo-125M | 768 | 3072 | 64 | 50257 | gelu | 12 | 12 | 2048 | 85M | standard |
gpt-neo-1.3B | 2048 | 8192 | 128 | 50257 | gelu | 16 | 24 | 2048 | 1.2B | standard |
gpt-neo-2.7B | 2560 | 10240 | 128 | 50257 | gelu | 20 | 32 | 2048 | 2.5B | standard |
gpt-j-6B | 4096 | 16384 | 256 | 50400 | gelu | 16 | 28 | 2048 | 5.6B | rotary |
gpt-neox-20b | 6144 | 24576 | 96 | 50432 | gelu_fast | 64 | 44 | 2048 | 20B | rotary |
stanford-gpt2-small-a | 768 | 3072 | 64 | 50257 | gelu | 12 | 12 | 1024 | 85M | standard |
stanford-gpt2-small-b | 768 | 3072 | 64 | 50257 | gelu | 12 | 12 | 1024 | 85M | standard |
stanford-gpt2-small-c | 768 | 3072 | 64 | 50257 | gelu | 12 | 12 | 1024 | 85M | standard |
stanford-gpt2-small-d | 768 | 3072 | 64 | 50257 | gelu | 12 | 12 | 1024 | 85M | standard |
stanford-gpt2-small-e | 768 | 3072 | 64 | 50257 | gelu | 12 | 12 | 1024 | 85M | standard |
stanford-gpt2-medium-a | 1024 | 4096 | 64 | 50257 | gelu | 16 | 24 | 1024 | 302M | standard |
stanford-gpt2-medium-b | 1024 | 4096 | 64 | 50257 | gelu | 16 | 24 | 1024 | 302M | standard |
stanford-gpt2-medium-c | 1024 | 4096 | 64 | 50257 | gelu | 16 | 24 | 1024 | 302M | standard |
stanford-gpt2-medium-d | 1024 | 4096 | 64 | 50257 | gelu | 16 | 24 | 1024 | 302M | standard |
stanford-gpt2-medium-e | 1024 | 4096 | 64 | 50257 | gelu | 16 | 24 | 1024 | 302M | standard |
pythia-70m | 512 | 2048 | 64 | 50304 | gelu | 8 | 6 | 2048 | 19M | rotary |
pythia-160m | 768 | 3072 | 64 | 50304 | gelu | 12 | 12 | 2048 | 85M | rotary |
pythia-410m | 1024 | 4096 | 64 | 50304 | gelu | 16 | 24 | 2048 | 302M | rotary |
pythia-1b | 2048 | 8192 | 256 | 50304 | gelu | 8 | 16 | 2048 | 805M | rotary |
pythia-1.4b | 2048 | 8192 | 128 | 50304 | gelu | 16 | 24 | 2048 | 1.2B | rotary |
pythia-2.8b | 2560 | 10240 | 80 | 50304 | gelu | 32 | 32 | 2048 | 2.5B | rotary |
pythia-6.9b | 4096 | 16384 | 128 | 50432 | gelu | 32 | 32 | 2048 | 6.4B | rotary |
pythia-12b | 5120 | 20480 | 128 | 50688 | gelu | 40 | 36 | 2048 | 11B | rotary |
pythia-70m-deduped | 512 | 2048 | 64 | 50304 | gelu | 8 | 6 | 2048 | 19M | rotary |
pythia-160m-deduped | 768 | 3072 | 64 | 50304 | gelu | 12 | 12 | 2048 | 85M | rotary |
pythia-410m-deduped | 1024 | 4096 | 64 | 50304 | gelu | 16 | 24 | 2048 | 302M | rotary |
pythia-1b-deduped | 2048 | 8192 | 256 | 50304 | gelu | 8 | 16 | 2048 | 805M | rotary |
pythia-1.4b-deduped | 2048 | 8192 | 128 | 50304 | gelu | 16 | 24 | 2048 | 1.2B | rotary |
pythia-2.8b-deduped | 2560 | 10240 | 80 | 50304 | gelu | 32 | 32 | 2048 | 2.5B | rotary |
pythia-6.9b-deduped | 4096 | 16384 | 128 | 50432 | gelu | 32 | 32 | 2048 | 6.4B | rotary |
pythia-12b-deduped | 5120 | 20480 | 128 | 50688 | gelu | 40 | 36 | 2048 | 11B | rotary |
solu-1l-old | 1024 | 4096 | 64 | 50278 | solu | 16 | 1 | 1024 | 13M | standard |
solu-2l-old | 736 | 2944 | 64 | 50278 | solu | 11 | 2 | 1024 | 13M | standard |
solu-4l-old | 512 | 2048 | 64 | 50278 | solu | 8 | 4 | 1024 | 13M | standard |
solu-6l-old | 768 | 3072 | 64 | 50278 | solu | 12 | 6 | 1024 | 42M | standard |
solu-8l-old | 1024 | 4096 | 64 | 50278 | solu | 16 | 8 | 1024 | 101M | standard |
solu-10l-old | 1280 | 5120 | 64 | 50278 | solu | 20 | 10 | 1024 | 197M | standard |
solu-12l-old | 1536 | 6144 | 64 | 50278 | solu | 24 | 12 | 1024 | 340M | standard |
solu-1l | 512 | 2048 | 64 | 48262 | solu | 8 | 1 | 1024 | 3.1M | standard |
solu-2l | 512 | 2048 | 64 | 48262 | solu | 8 | 2 | 1024 | 6.3M | standard |
solu-3l | 512 | 2048 | 64 | 48262 | solu | 8 | 3 | 1024 | 9.4M | standard |
solu-4l | 512 | 2048 | 64 | 48262 | solu | 8 | 4 | 1024 | 13M | standard |
solu-6l | 768 | 3072 | 64 | 48262 | solu | 12 | 6 | 1024 | 42M | standard |
solu-8l | 1024 | 4096 | 64 | 48262 | solu | 16 | 8 | 1024 | 101M | standard |
solu-10l | 1280 | 5120 | 64 | 48262 | solu | 20 | 10 | 1024 | 197M | standard |
solu-12l | 1536 | 6144 | 64 | 48262 | solu | 24 | 12 | 1024 | 340M | standard |
gelu-1l | 512 | 2048 | 64 | 48262 | gelu | 8 | 1 | 1024 | 3.1M | standard |
gelu-2l | 512 | 2048 | 64 | 48262 | gelu | 8 | 2 | 1024 | 6.3M | standard |
gelu-3l | 512 | 2048 | 64 | 48262 | gelu | 8 | 3 | 1024 | 9.4M | standard |
gelu-4l | 512 | 2048 | 64 | 48262 | gelu | 8 | 4 | 1024 | 13M | standard |
attn-only-1l | 512 | 2048 | 64 | 48262 | attn_only | 8 | 1 | 1024 | 1.0M | standard |
attn-only-2l | 512 | 2048 | 64 | 48262 | attn_only | 8 | 2 | 1024 | 2.1M | standard |
attn-only-3l | 512 | 2048 | 64 | 48262 | attn_only | 8 | 3 | 1024 | 3.1M | standard |
attn-only-4l | 512 | 2048 | 64 | 48262 | attn_only | 8 | 4 | 1024 | 4.2M | standard |
attn-only-2l-demo | 512 | 2048 | 64 | 50277 | attn_only | 8 | 2 | 1024 | 2.1M | shortformer |
solu-1l-wiki | 512 | 2048 | 64 | 48262 | solu | 8 | 1 | 1024 | 3.1M | standard |
solu-4l-wiki | 512 | 2048 | 64 | 48262 | solu | 8 | 4 | 1024 | 13M | standard |