“X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model”

X-VLA

Training

# Launch training on the Gongsta/koch-tshirt-folding-v3 dataset, starting from
# the lerobot/xvla-base policy and writing outputs to ./outputs/xvla_training.
# The final argument deliberately has no trailing backslash, so the command
# ends there regardless of what follows it when pasted.
lerobot-train \
  --dataset.repo_id=Gongsta/koch-tshirt-folding-v3 \
  --dataset.video_backend=pyav \
  --output_dir=./outputs/xvla_training \
  --job_name=xvla_training \
  --policy.path="lerobot/xvla-base" \
  --policy.repo_id="Gongsta/xvla-koch-tshirt" \
  --policy.dtype=bfloat16 \
  --policy.action_mode=auto \
  --policy.max_action_dim=20 \
  --steps=20000 \
  --save_freq=2000 \
  --policy.device=cuda \
  --wandb.enable=true \
  --policy.freeze_vision_encoder=false \
  --policy.freeze_language_encoder=false \
  --policy.train_policy_transformer=true \
  --policy.train_soft_prompts=true \
  --policy.num_image_views=3 \
  --policy.input_features=null

Resume

# Resume the xvla_training job using the train_config.json stored under
# outputs/xvla_training/checkpoints/last. Flags are grouped by concern:
# resume/run identity, dataset, policy, then schedule and logging.
lerobot-train \
  --resume=true \
  --config_path=outputs/xvla_training/checkpoints/last/pretrained_model/train_config.json \
  --job_name=xvla_training \
  --output_dir=./outputs/xvla_training \
  --dataset.repo_id=Gongsta/koch-tshirt-folding-v3 \
  --dataset.video_backend=pyav \
  --policy.repo_id=Gongsta/xvla-koch-tshirt \
  --policy.device=cuda \
  --policy.dtype=bfloat16 \
  --policy.action_mode=auto \
  --policy.max_action_dim=20 \
  --policy.num_image_views=3 \
  --policy.input_features=null \
  --policy.freeze_vision_encoder=false \
  --policy.freeze_language_encoder=false \
  --policy.train_policy_transformer=true \
  --policy.train_soft_prompts=true \
  --steps=20000 \
  --save_freq=2000 \
  --wandb.enable=true

Inference

# Run the trained policy on the bimanual Koch follower and record an eval
# episode dataset locally (push_to_hub is disabled).
#
# HF_USER is assigned on its own line on purpose: with the prefix form
# `HF_USER=... lerobot-record ... ${HF_USER}/...` the shell expands ${HF_USER}
# in the arguments before the assignment takes effect, so the repo id would
# come out as "/eval_<timestamp>" unless HF_USER was already set.
#
# FOLLOWER_LEFT_PORT, FOLLOWER_RIGHT_PORT, TOP_CAMERA_INDEX_OR_PATH,
# LEFT_WRIST_CAMERA_INDEX_OR_PATH and RIGHT_WRIST_CAMERA_INDEX_OR_PATH are not
# defined in this snippet and must be set in the shell beforehand.
HF_USER=Gongsta
lerobot-record \
    --robot.type=bi_koch_follower \
    --robot.left_arm_port="$FOLLOWER_LEFT_PORT" \
    --robot.right_arm_port="$FOLLOWER_RIGHT_PORT" \
    --robot.id=bimanual_follower \
    --robot.cameras="{ top: {type: opencv, index_or_path: $TOP_CAMERA_INDEX_OR_PATH, width: 640, height: 480, fps: 30}, left_wrist: {type: opencv, index_or_path: $LEFT_WRIST_CAMERA_INDEX_OR_PATH, width: 640, height: 480, fps: 30}, right_wrist: {type: opencv, index_or_path: $RIGHT_WRIST_CAMERA_INDEX_OR_PATH, width: 640, height: 480, fps: 30} }" \
    --dataset.repo_id="${HF_USER}/eval_$(date +%Y-%m-%d_%H-%M-%S)" \
    --dataset.single_task="Fold the t-shirt and put it in the bin" \
    --display_data=true \
    --policy.path=./outputs/xvla_training/checkpoints/last/pretrained_model/ \
    --dataset.push_to_hub=False

🛠️ Steven Gong

Table of Contents

“X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model”

X-VLA

Graph View

Backlinks