# CUDA_VISIBLE_DEVICES=1 python test_fatezero.py --config config/shape/man_wonder_bat_spider.yaml
pretrained_model_path: "ckpt/man_skate_250"
dataset_config:
path: "data/shape/man_skate"
prompt: "A man rides a wooden skateboard on the rail with a helmet and arms outstretched"
n_sample_frame: 8
sampling_rate: 1
stride: 80
offset:
left: 0
right: 0
top: 0
bottom: 0
editing_config:
use_invertion_latents: true
use_inversion_attention: true
guidance_scale: 7.5
editing_prompts: [
# source prompt
A man rides a wooden skateboard on the rail with a helmet and arms outstretched,
# foreground character replacement
A Wonder Woman rides a wooden skateboard on the rail with cowboy hat and arms outstretched,
A Spider-Man rides a wooden skateboard on the rail and arms outstretched,
A Batman rides a wooden skateboard on the rail and arms outstretched
]
p2p_config:
0:
# Whether to directly copy the cross-attention from the source
# True: directly copy, better for object replacement
# False: keep the source attention, better for style editing
is_replace_controller: False
# Semantic preserving and replacement ratio for cross-attention
cross_replace_steps:
default_: 0.8
# Source background structure preserving, in [0, 1]
# e.g., 0.6 replaces self-attention in the first 60% of steps
self_replace_steps: 0.6
# Amplify the cross-attention of the target words; a larger value moves the result closer to the target
# eq_params:
# words: ["silver", "sculpture"]
# values: [2,2]
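# A hypothetical sketch, not part of the original config: for this edit, eq_params
# could instead amplify the new character token; the word and value below are illustrative only
# eq_params:
#   words: ["Batman"]
#   values: [2]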
# Target structure-divergence hyperparameters
# If you change the shape of the object, it is better to use all three lines below; otherwise they are not needed.
# Without the following three lines, all self-attention will be replaced
# blend_words: [['cat',], ["cat",]]
# blend_self_attention: True
# # blend_latents: False # performance is not good in our case, needs debugging
# blend_th: [2, 2]
# blend_th preserves the source structure of blend_words, in [0, 1]
# the default blend_th: [2, 2] preserves all source self-attention
# blend_th: [0.0, 0.0] drives the mask -> 1, using more target self-attention (att_replace): more generated attention, less source attention
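# Illustrative sketch only (kept disabled for this source-reconstruction entry):
# enabling shape-change blending here would mirror entries 1-3 below, e.g.
# blend_words: [['man',]]
# blend_self_attention: True
# blend_th: [0.3, 0.3]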
1:
is_replace_controller: False
cross_replace_steps:
default_: 0.3
self_replace_steps: 0.3
blend_words: [['man',]]
blend_self_attention: True
# blend_latents: False # performance is not good in our case, needs debugging
blend_th: [0.3, 0.3]
2:
is_replace_controller: False
cross_replace_steps:
default_: 0.3
self_replace_steps: 0.3
blend_words: [['man',]]
blend_self_attention: True
# blend_latents: False # performance is not good in our case, needs debugging
blend_th: [0.3, 0.3]
3:
is_replace_controller: False
cross_replace_steps:
default_: 0.9
self_replace_steps: 0.9
blend_words: [['man',]]
blend_self_attention: True
# blend_latents: False # performance is not good in our case, needs debugging
blend_th: [0.3, 0.3]
clip_length: "${..dataset_config.n_sample_frame}"
sample_seeds: [0]
num_inference_steps: 50
prompt2prompt_edit: True
model_config:
lora: 16
# temporal_downsample_time: 4
# SparseCausalAttention_index: ['mid']
# least_sc_channel: 1280
# least_sc_channel: 100000
test_pipeline_config:
target: video_diffusion.pipelines.p2p_ddim_spatial_temporal.P2pDDIMSpatioTemporalPipeline
num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
seed: 0