“X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model”
X-VLA
Training
# Fine-tune starting from the lerobot/xvla-base policy on the
# Gongsta/koch-tshirt-folding-v3 dataset, writing run output to
# ./outputs/xvla_training.
# The final flag deliberately has no trailing backslash: a continuation there
# would make the shell append the following line to this command as an
# extra argument.
lerobot-train \
  --dataset.repo_id=Gongsta/koch-tshirt-folding-v3 \
  --dataset.video_backend=pyav \
  --output_dir=./outputs/xvla_training \
  --job_name=xvla_training \
  --policy.path="lerobot/xvla-base" \
  --policy.repo_id="Gongsta/xvla-koch-tshirt" \
  --policy.dtype=bfloat16 \
  --policy.action_mode=auto \
  --policy.max_action_dim=20 \
  --steps=20000 \
  --save_freq=2000 \
  --policy.device=cuda \
  --wandb.enable=true \
  --policy.freeze_vision_encoder=false \
  --policy.freeze_language_encoder=false \
  --policy.train_policy_transformer=true \
  --policy.train_soft_prompts=true \
  --policy.num_image_views=3 \
  --policy.input_features=null
Resume
# Resume the xvla_training run using the train_config.json stored with the
# last checkpoint. The flags are gathered in an array and expanded in their
# original order, so lerobot-train receives exactly the same argument list.
resume_flags=(
  # Dataset and run identity.
  --dataset.repo_id=Gongsta/koch-tshirt-folding-v3
  --dataset.video_backend=pyav
  --output_dir=./outputs/xvla_training
  --job_name=xvla_training
  # Policy settings and training schedule.
  --policy.repo_id="Gongsta/xvla-koch-tshirt"
  --policy.dtype=bfloat16
  --policy.action_mode=auto
  --policy.max_action_dim=20
  --steps=20000
  --save_freq=2000
  --policy.device=cuda
  --wandb.enable=true
  # Which parts of the model are trained.
  --policy.freeze_vision_encoder=false
  --policy.freeze_language_encoder=false
  --policy.train_policy_transformer=true
  --policy.train_soft_prompts=true
  --policy.num_image_views=3
  --policy.input_features=null
  # Resume switches.
  --resume=true
  --config_path=outputs/xvla_training/checkpoints/last/pretrained_model/train_config.json
)
lerobot-train "${resume_flags[@]}"
Inference
# Run the policy saved under ./outputs/xvla_training/checkpoints/last on the
# bi_koch_follower robot and record the result into a timestamped eval
# dataset; --dataset.push_to_hub=False is passed.
#
# Variables read from the calling shell:
#   FOLLOWER_LEFT_PORT, FOLLOWER_RIGHT_PORT
#   TOP_CAMERA_INDEX_OR_PATH, LEFT_WRIST_CAMERA_INDEX_OR_PATH,
#   RIGHT_WRIST_CAMERA_INDEX_OR_PATH
#
# HF_USER is set in its own statement. A `HF_USER=... lerobot-record ...`
# prefix only reaches the child's environment; the ${HF_USER} in the argument
# list is expanded by the current shell first and would be empty there.
export HF_USER=Gongsta
lerobot-record \
  --robot.type=bi_koch_follower \
  --robot.left_arm_port="$FOLLOWER_LEFT_PORT" \
  --robot.right_arm_port="$FOLLOWER_RIGHT_PORT" \
  --robot.id=bimanual_follower \
  --robot.cameras="{ top: {type: opencv, index_or_path: $TOP_CAMERA_INDEX_OR_PATH, width: 640, height: 480, fps: 30}, left_wrist: {type: opencv, index_or_path: $LEFT_WRIST_CAMERA_INDEX_OR_PATH, width: 640, height: 480, fps: 30}, right_wrist: {type: opencv, index_or_path: $RIGHT_WRIST_CAMERA_INDEX_OR_PATH, width: 640, height: 480, fps: 30} }" \
  --dataset.repo_id="${HF_USER}/eval_$(date +%Y-%m-%d_%H-%M-%S)" \
  --dataset.single_task="Fold the t-shirt and put it in the bin" \
  --display_data=true \
  --policy.path=./outputs/xvla_training/checkpoints/last/pretrained_model/ \
  --dataset.push_to_hub=False