hparam_presets = {
    # Bidirectional LSTM encoder (transformer_encoder=False); KL weight annealed from 0.2 over 8k steps.
    'lstm-benchmark': {
        'model': dict(
            bidirectional_encoder=True,
            d_model=1024,
            d_embedding=512,
            grad_clip_threshold=150.0,
            init_scale=None,
            kl_weight_start=0.2,
            kl_annealing_steps=8000,
            latent_depth=64,
            lr=3e-4,
            tie_embedding_weights=True,
            tie_logit_weights=True,
            transformer_encoder=False
        ),
        'trainer': dict(
            accumulate_grad_batches=2
        )
    },
    # LSTM encoder on the 2020-05-01 English Wikipedia dump; fixed KL weight (no annealing).
    'lstm-wikipedia': {
        'data': dict(
            dataset_name='wikipedia',
            dataset_config='20200501.en',
            tokens_per_batch=50_000,
            min_tokens_per_sample=512,
            max_tokens_per_sample=25_000
        ),
        'model': dict(
            bidirectional_encoder=True,
            d_model=2048,
            d_embedding=512,
            grad_clip_threshold=150.0,
            init_scale=None,
            kl_weight_start=1.0,
            kl_annealing_steps=0,
            latent_depth=64,
            lr=3e-4,
            tie_embedding_weights=True,
            tie_logit_weights=True,
            transformer_encoder=False
        ),
        'trainer': dict(
            accumulate_grad_batches=2,
            val_check_interval=0.25
        )
    },
    # Dense self-attention baseline; KL weight annealed from 0.3 to 1.0 over 8k steps.
    'dense-benchmark': {
        'data': dict(
            dataset_name='wikipedia',
            dataset_config='20200501.en',
            tokens_per_batch=50_000,
            min_tokens_per_sample=512,
            max_tokens_per_sample=3_125
        ),
        'model': dict(
            d_model=512,
            grad_checkpointing=True,
            grad_clip_threshold=150.0,
            init_scale=0.02,
            kl_weight_start=0.3,
            kl_weight_end=1.0,
            kl_annealing_steps=8000,
            latent_depth=64,
            lr=3e-4,
            num_layers=6,
            sparse_self_attention=False,
            tie_embedding_weights=True
        ),
        'trainer': dict(
            accumulate_grad_batches=2
        )
    },
    # Sparse self-attention variant of the benchmark; fixed KL weight of 1.0.
    'sparse-benchmark': {
        'data': dict(
            dataset_name='wikipedia',
            dataset_config='20200501.en',
            tokens_per_batch=50_000,
            min_tokens_per_sample=512,
            max_tokens_per_sample=3_125
        ),
        'model': dict(
            d_model=512,
            grad_checkpointing=True,
            grad_clip_threshold=150.0,
            init_scale=0.02,
            kl_weight_start=1.0,
            kl_annealing_steps=0,
            latent_depth=64,
            lr=3e-4,
            num_layers=6,
            sparse_self_attention=True,
            tie_embedding_weights=True
        ),
        'trainer': dict(
            accumulate_grad_batches=2
        )
    },
    # Plain language model on Wikipedia: no latent/KL settings, i.e. a non-VAE baseline.
    'nonvae-wikipedia': {
        'data': dict(
            dataset_name='wikipedia',
            dataset_config='20200501.en',
            tokens_per_batch=50_000,
            min_tokens_per_sample=512,
            max_tokens_per_sample=3_125
        ),
        'model': dict(
            d_model=512,
            grad_checkpointing=True,
            grad_clip_threshold=150.0,
            init_scale=0.02,
            lr=3e-4,
            num_layers=6,
            sparse_self_attention=False,
            tie_embedding_weights=True
        ),
        'trainer': dict(
            accumulate_grad_batches=2,
            val_check_interval=0.1
        )
    },
    # Sparse-attention VAE on Wikipedia with long samples (up to 50k tokens per sample).
    'wikipedia': {
        'data': dict(
            dataset_name='wikipedia',
            dataset_config='20200501.en',
            tokens_per_batch=100_000,
            min_tokens_per_sample=512,
            max_tokens_per_sample=50_000
        ),
        'model': dict(
            d_model=512,
            grad_checkpointing=True,
            grad_clip_threshold=150.0,
            init_scale=0.02,
            attn_window_size=8,
            kl_weight_start=0.1,
            kl_weight_end=1.0,
            kl_annealing_steps=8000,
            latent_depth=64,
            lr=3e-4,
            num_layers=6,
            sparse_self_attention=True,
            tie_embedding_weights=True
        ),
        'trainer': dict(
            accumulate_grad_batches=2,
            val_check_interval=0.1
        )
    },
    # Sparse-attention VAE on PG-19 book-length samples (up to ~100k tokens per sample).
    'pg19': {
        'data': dict(
            dataset_name='pg19',
            dataset_config=None,
            tokens_per_batch=102_912,
            min_tokens_per_sample=512,
            max_tokens_per_sample=102_400
        ),
        'model': dict(
            # adam_beta1=0.95,
            d_model=512,
            grad_checkpointing=True,
            grad_clip_threshold=150.0,
            init_scale=0.02,
            attn_window_size=6,
            kl_weight_start=0.1,
            kl_weight_end=1.0,
            kl_annealing_steps=8000,
            latent_depth=64,
            lr=3e-4,
            num_layers=6,
            sparse_self_attention=True,
            tie_embedding_weights=True
        ),
        'trainer': dict(
            accumulate_grad_batches=4,
            val_check_interval=0.5
        )
    },
    # Plain (non-VAE) language model baseline on PG-19.
    'nonvae-pg19': {
        'data': dict(
            dataset_name='pg19',
            dataset_config=None,
            tokens_per_batch=92_672,
            min_tokens_per_sample=512,
            max_tokens_per_sample=92_160
        ),
        'model': dict(
            d_model=512,
            grad_checkpointing=True,
            grad_clip_threshold=150.0,
            init_scale=0.02,
            lr=3e-4,
            num_layers=6,
            sparse_self_attention=True,
            tie_embedding_weights=True
        ),
        'trainer': dict(
            accumulate_grad_batches=4,
            val_check_interval=0.5
        )
    }
}
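

# Minimal sketch (illustrative only) of how a preset might be applied on top of
# per-group defaults. `apply_preset` and the shape of `defaults` are hypothetical
# assumptions for this example — the surrounding codebase may instead merge
# presets into its argument parser or hparam objects differently.
def apply_preset(name, defaults):
    """Deep-merge the named preset's 'data'/'model'/'trainer' overrides into defaults."""
    # Copy each group so the caller's defaults are left untouched.
    merged = {group: dict(params) for group, params in defaults.items()}
    for group, overrides in hparam_presets[name].items():
        merged.setdefault(group, {}).update(overrides)
    return merged


# Usage sketch:
# hparams = apply_preset('wikipedia', {'data': {}, 'model': {}, 'trainer': {}})
# hparams['model']['lr']          # -> 3e-4
# hparams['trainer']['val_check_interval']  # -> 0.1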