chain simulation
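Two standalone variants of a fixed-capacity, structure-of-arrays chain simulation: an AArch64 NEON position-based solver, followed by an AVX2 (x86) port that adds AVBD-style adaptive constraint stiffness.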
#include <arm_neon.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <float.h>
enum {
MAX_CHNS = 64,
PTC_PER_CHN = 16,
SPH_PER_CHN = 8,
MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
CON_PER_CHN = PTC_PER_CHN,
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
MAX_ITR = 16,
};
struct chn_cfg {
float damping; // [0, 1]
float stiffness; // [0, 1]
float stretch_factor;
float max_strain;
float friction; // [0, 1]
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
};
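/* Illustrative configuration (values assumed, not from the original source):
 *   struct chn_cfg cfg = {
 *       .damping = 0.98f,            // retain 98% of velocity per step
 *       .stiffness = 1.0f,
 *       .stretch_factor = 1.0f,
 *       .max_strain = 0.1f,
 *       .friction = 0.3f,
 *       .linear_inertia = 1.0f,
 *       .angular_inertia = 1.0f,
 *       .centrifugal_inertia = 1.0f,
 *   };
 */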
struct chn_sim {
unsigned char free_idx_cnt;
unsigned char free_idx[MAX_CHNS];
struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));
// chain global forces (world space gravity and wind)
float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));
// chain transformations (world space)
float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));
// particle positions (model space)
float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));
// sphere positions (model space)
float sph_x[MAX_SPH] __attribute__((aligned(32)));
float sph_y[MAX_SPH] __attribute__((aligned(32)));
float sph_z[MAX_SPH] __attribute__((aligned(32)));
float sph_r[MAX_SPH] __attribute__((aligned(32)));
// rest positions (model space)
float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
float motion_radius[MAX_PTC] __attribute__((aligned(32)));
};
extern void
chn_sim_init(struct chn_sim *cns) {
assert(cns);
cns->free_idx_cnt = MAX_CHNS;
for (int i = 0; i < MAX_CHNS; ++i) {
cns->free_idx[i] = MAX_CHNS - i - 1;
}
for (int i = 0; i < MAX_CHNS; i += 4) {
vst1q_f32(&cns->chn_quat_w[i], vdupq_n_f32(1.0f));
vst1q_f32(&cns->chn_prev_quat_w[i], vdupq_n_f32(1.0f));
}
}
extern int
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
assert(cns);
assert(cfg);
assert(cns->free_idx_cnt > 0);
assert(cnt <= PTC_PER_CHN);
assert(cnt > 1);
int chn = cns->free_idx[--cns->free_idx_cnt];
cns->chn_cfg[chn] = *cfg;
int p_idx = (chn * PTC_PER_CHN);
int r_idx = (chn * CON_PER_CHN);
for (int i = 0; i < cnt; ++i) {
cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];
// seed Verlet history so the chain starts at rest (zero initial velocity)
cns->ptc_prev_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->ptc_prev_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->ptc_prev_pos_z[p_idx + i] = rest_pos[i*4+2];
cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
}
for (int i = cnt; i < PTC_PER_CHN; ++i) {
cns->ptc_pos_x[p_idx + i] = 0.0f;
cns->ptc_pos_y[p_idx + i] = 0.0f;
cns->ptc_pos_z[p_idx + i] = 0.0f;
cns->ptc_prev_pos_x[p_idx + i] = 0.0f;
cns->ptc_prev_pos_y[p_idx + i] = 0.0f;
cns->ptc_prev_pos_z[p_idx + i] = 0.0f;
cns->ptc_inv_mass[p_idx + i] = 0.0f; // padding particles are massless and inert
}
for (int i = 1; i < cnt; ++i) {
float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
}
for (int i = cnt - 1; i < CON_PER_CHN; ++i) {
cns->ptc_rest_len[r_idx + i] = 0.0f; // only cnt-1 constraints are used
}
for (int i = 0; i < PTC_PER_CHN; ++i) {
cns->motion_radius[p_idx + i] = FLT_MAX; // unconstrained until chn_sim_con_motion() is called
}
return chn;
}
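/* Usage sketch (assumed convention, mirroring the loops above): rest_pos
 * packs four floats per particle: x, y, z in model space plus inverse mass
 * in w. An inverse mass of 0 pins the particle, so the first entry is
 * typically the pinned root:
 *
 *   float rest[3 * 4] = {
 *       0.0f,  0.0f, 0.0f, 0.0f,   // pinned root
 *       0.0f, -0.1f, 0.0f, 1.0f,
 *       0.0f, -0.2f, 0.0f, 1.0f,
 *   };
 *   int chn = chn_sim_add(&cns, &cfg, rest, 3);
 */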
extern void
chn_sim_del(struct chn_sim *cns, int chn) {
assert(cns);
assert(chn >= 0);
assert(chn < MAX_CHNS);
int p_idx = (chn * PTC_PER_CHN);
int s_idx = (chn * SPH_PER_CHN);
int c_idx = (chn * CON_PER_CHN);
cns->free_idx[cns->free_idx_cnt++] = chn;
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
cns->chn_grav_x[chn] = 0.0f;
cns->chn_grav_y[chn] = 0.0f;
cns->chn_grav_z[chn] = 0.0f;
cns->chn_wnd_x[chn] = 0.0f;
cns->chn_wnd_y[chn] = 0.0f;
cns->chn_wnd_z[chn] = 0.0f;
cns->chn_pos_x[chn] = 0.0f;
cns->chn_pos_y[chn] = 0.0f;
cns->chn_pos_z[chn] = 0.0f;
cns->chn_quat_x[chn] = 0.0f;
cns->chn_quat_y[chn] = 0.0f;
cns->chn_quat_z[chn] = 0.0f;
cns->chn_quat_w[chn] = 1.0f;
cns->chn_prev_pos_x[chn] = 0.0f;
cns->chn_prev_pos_y[chn] = 0.0f;
cns->chn_prev_pos_z[chn] = 0.0f;
cns->chn_prev_quat_x[chn] = 0.0f;
cns->chn_prev_quat_y[chn] = 0.0f;
cns->chn_prev_quat_z[chn] = 0.0f;
cns->chn_prev_quat_w[chn] = 1.0f;
for (int i = 0; i < PTC_PER_CHN; i += 4) {
vst1q_f32(&cns->ptc_pos_x[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->ptc_pos_y[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->ptc_pos_z[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->ptc_inv_mass[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->ptc_prev_pos_x[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->ptc_prev_pos_y[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->ptc_prev_pos_z[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->rest_pos_x[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->rest_pos_y[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->rest_pos_z[p_idx + i], vdupq_n_f32(0.0f));
vst1q_f32(&cns->motion_radius[p_idx + i], vdupq_n_f32(0.0f));
}
// Spheres (SPH_PER_CHN == 8: two 4-lane NEON stores per array)
vst1q_f32(&cns->sph_x[s_idx], vdupq_n_f32(0.0f));
vst1q_f32(&cns->sph_y[s_idx], vdupq_n_f32(0.0f));
vst1q_f32(&cns->sph_z[s_idx], vdupq_n_f32(0.0f));
vst1q_f32(&cns->sph_r[s_idx], vdupq_n_f32(0.0f));
vst1q_f32(&cns->sph_x[s_idx + 4], vdupq_n_f32(0.0f));
vst1q_f32(&cns->sph_y[s_idx + 4], vdupq_n_f32(0.0f));
vst1q_f32(&cns->sph_z[s_idx + 4], vdupq_n_f32(0.0f));
vst1q_f32(&cns->sph_r[s_idx + 4], vdupq_n_f32(0.0f));
// Loop by 4 for NEON
for (int i = 0; i < CON_PER_CHN; i += 4) {
vst1q_f32(&cns->ptc_rest_len[c_idx + i], vdupq_n_f32(0.0f));
}
}
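// Set the chain's world transform. The previous transform is kept, so the
// delta is turned into particle inertia by the next chn_sim_run().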
extern void
chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
assert(cns);
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = rot4[3];
}
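// Teleport: sets current and previous transform together so the jump
// generates no inertia.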
extern void
chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
assert(cns);
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
}
extern void
chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
assert(cns);
assert(grav3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_grav_x[chn] = grav3[0];
cns->chn_grav_y[chn] = grav3[1];
cns->chn_grav_z[chn] = grav3[2];
}
extern void
chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
assert(cns);
assert(wind3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_wnd_x[chn] = wind3[0];
cns->chn_wnd_y[chn] = wind3[1];
cns->chn_wnd_z[chn] = wind3[2];
}
extern void
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
assert(cns);
assert(spheres);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt <= SPH_PER_CHN);
int s_idx = (chn * SPH_PER_CHN);
for (int i = 0; i < cnt; i++) {
cns->sph_x[s_idx + i] = spheres[i*4+0];
cns->sph_y[s_idx + i] = spheres[i*4+1];
cns->sph_z[s_idx + i] = spheres[i*4+2];
cns->sph_r[s_idx + i] = spheres[i*4+3];
}
for (int i = cnt; i < SPH_PER_CHN; ++i) {
cns->sph_x[s_idx + i] = 0;
cns->sph_y[s_idx + i] = 0;
cns->sph_z[s_idx + i] = 0;
cns->sph_r[s_idx + i] = 0;
}
}
extern void
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
assert(cns);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt >= 0);
assert(cnt <= PTC_PER_CHN);
int p_idx = (chn * PTC_PER_CHN);
for (int i = 0; i < cnt; ++i) {
cns->motion_radius[p_idx + i] = radius[i];
}
for (int i = cnt; i < PTC_PER_CHN; ++i) {
cns->motion_radius[p_idx + i] = FLT_MAX;
}
}
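/* Illustrative scalar reference (assumed helper, not part of the original API
 * and not called by the solver): chn_sim_run below rotates world-space wind
 * and gravity into chain-local space by the conjugate quaternion,
 * v' = q^-1 * (0,v) * q, for a unit quaternion q = (x, y, z, w). The NEON
 * code computes exactly these terms, four chains per iteration. */
static inline void
quat_rot_inv_ref(float out3[3], const float q4[4], const float v3[3]) {
float qx = q4[0], qy = q4[1], qz = q4[2], qw = q4[3];
// t = conj(q) * (0, v)
float tw = qx*v3[0] + qy*v3[1] + qz*v3[2];
float tx = qw*v3[0] - qy*v3[2] + qz*v3[1];
float ty = qw*v3[1] - qz*v3[0] + qx*v3[2];
float tz = qw*v3[2] - qx*v3[1] + qy*v3[0];
// v' = vector part of (t * q)
out3[0] = tw*qx + tx*qw + ty*qz - tz*qy;
out3[1] = tw*qy + ty*qw + tz*qx - tx*qz;
out3[2] = tw*qz + tz*qw + tx*qy - ty*qx;
}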
extern void
chn_sim_run(struct chn_sim *cns, float dt) {
// NEON constants
const float32x4_t dt_vec = vdupq_n_f32(dt);
const float32x4_t dt2_vec = vdupq_n_f32(dt * dt);
const float32x4_t one_vec = vdupq_n_f32(1.0f);
const float32x4_t neg_one_vec = vdupq_n_f32(-1.0f);
const float32x4_t eps_vec = vdupq_n_f32(1e-6f);
const float32x4_t ptc_inv_mass_clamp_min = vdupq_n_f32(0.0f);
const float32x4_t ptc_inv_mass_clamp_max = vdupq_n_f32(1e6f);
const float32x4_t zero_vec = vdupq_n_f32(0.0f);
const float32x4_t inv_dt_vec = vdupq_n_f32(1.0f / dt);
const float32x4_t two_vec = vdupq_n_f32(2.0f);
const float32x4_t inertia_clamp_min = vdupq_n_f32(0.0f);
const float32x4_t inertia_clamp_max = vdupq_n_f32(1.0f);
// NEON has no direct equivalent of AVX's sign-bit mask trick
// (_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF))); vabsq_f32 is used
// directly where an absolute value is needed.
// Local arrays for PBD solver
float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
// Initialize the dummy tail slot so the (i+1)-shifted loads in the distance solver stay in bounds
px_lcl[PTC_PER_CHN] = 0.0f;
py_lcl[PTC_PER_CHN] = 0.0f;
pz_lcl[PTC_PER_CHN] = 0.0f;
pm_lcl[PTC_PER_CHN] = 0.0f;
// Stack arrays for precomputed data
float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float vel_x[MAX_CHNS] __attribute__((aligned(32)));
float vel_y[MAX_CHNS] __attribute__((aligned(32)));
float vel_z[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));
// Step 0: Precomputation
for (int c = 0; c < MAX_CHNS; c += 4) {
// Load chain quaternions
float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]);
float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]);
float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]);
float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
float32x4_t cqx = vnegq_f32(qx); // -qx
float32x4_t cqy = vnegq_f32(qy); // -qy
float32x4_t cqz = vnegq_f32(qz); // -qz
// Compute local-space wind
{
float32x4_t vx = vld1q_f32(&cns->chn_wnd_x[c]);
float32x4_t vy = vld1q_f32(&cns->chn_wnd_y[c]);
float32x4_t vz = vld1q_f32(&cns->chn_wnd_z[c]);
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); // qw*vx + (-qy)*vz
tx = vmlsq_f32(tx, cqz, vy); // qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); // qw*vy + (-qz)*vx
ty = vmlsq_f32(ty, cqx, vz); // qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); // qw*vz + (-qx)*vy
tz = vmlsq_f32(tz, cqy, vx); // qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); // - ((-qx)*vx + (-qy)*vy) = qx*vx + qy*vy
tw = vmlsq_f32(tw, cqz, vz); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: res_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// res_x = tw*qx + tx*qw + ty*qz - tz*qy
// res_y = tw*qy + ty*qw + tz*qx - tx*qz
// res_z = tw*qz + tz*qw + tx*qy - ty*qx
float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
res_x = vmlsq_f32(res_x, tz, qy);
res_x = vmlaq_f32(res_x, ty, qz); // tw*qx + tx*qw + ty*qz - tz*qy
float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
res_y = vmlsq_f32(res_y, tx, qz);
res_y = vmlaq_f32(res_y, tz, qx); // tw*qy + ty*qw + tz*qx - tx*qz
float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
res_z = vmlsq_f32(res_z, ty, qx);
res_z = vmlaq_f32(res_z, tx, qy); // tw*qz + tz*qw + tx*qy - ty*qx
vst1q_f32(&chn_wnd_lcl_x[c], res_x);
vst1q_f32(&chn_wnd_lcl_y[c], res_y);
vst1q_f32(&chn_wnd_lcl_z[c], res_z);
}
// Compute local-space gravity
{
float32x4_t vx = vld1q_f32(&cns->chn_grav_x[c]);
float32x4_t vy = vld1q_f32(&cns->chn_grav_y[c]);
float32x4_t vz = vld1q_f32(&cns->chn_grav_z[c]);
// Step 1: t = q_conj * v_world_as_quat
float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz);
tx = vmlsq_f32(tx, cqz, vy);
float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx);
ty = vmlsq_f32(ty, cqx, vz);
float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy);
tz = vmlsq_f32(tz, cqy, vx);
float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy));
tw = vmlsq_f32(tw, cqz, vz);
// Step 2: res_vec = (t * q)_vec
float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
res_x = vmlsq_f32(res_x, tz, qy);
res_x = vmlaq_f32(res_x, ty, qz);
float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
res_y = vmlsq_f32(res_y, tx, qz);
res_y = vmlaq_f32(res_y, tz, qx);
float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
res_z = vmlsq_f32(res_z, ty, qx);
res_z = vmlaq_f32(res_z, tx, qy);
vst1q_f32(&chn_grav_lcl_x[c], res_x);
vst1q_f32(&chn_grav_lcl_y[c], res_y);
vst1q_f32(&chn_grav_lcl_z[c], res_z);
}
// Compute linear velocity
{
float32x4_t curr_x = vld1q_f32(&cns->chn_pos_x[c]);
float32x4_t curr_y = vld1q_f32(&cns->chn_pos_y[c]);
float32x4_t curr_z = vld1q_f32(&cns->chn_pos_z[c]);
float32x4_t prev_x = vld1q_f32(&cns->chn_prev_pos_x[c]);
float32x4_t prev_y = vld1q_f32(&cns->chn_prev_pos_y[c]);
float32x4_t prev_z = vld1q_f32(&cns->chn_prev_pos_z[c]);
float32x4_t vel_x_vec = vmulq_f32(vsubq_f32(curr_x, prev_x), inv_dt_vec);
float32x4_t vel_y_vec = vmulq_f32(vsubq_f32(curr_y, prev_y), inv_dt_vec);
float32x4_t vel_z_vec = vmulq_f32(vsubq_f32(curr_z, prev_z), inv_dt_vec);
vst1q_f32(&vel_x[c], vel_x_vec);
vst1q_f32(&vel_y[c], vel_y_vec);
vst1q_f32(&vel_z[c], vel_z_vec);
}
// Compute angular velocity
{
float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]);
float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]);
float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]);
float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]);
float32x4_t prev_qx = vld1q_f32(&cns->chn_prev_quat_x[c]);
float32x4_t prev_qy = vld1q_f32(&cns->chn_prev_quat_y[c]);
float32x4_t prev_qz = vld1q_f32(&cns->chn_prev_quat_z[c]);
float32x4_t prev_qw = vld1q_f32(&cns->chn_prev_quat_w[c]);
// Step 1: Compute delta quaternion (to - from)
float32x4_t dt_qx = vsubq_f32(qx, prev_qx);
float32x4_t dt_qy = vsubq_f32(qy, prev_qy);
float32x4_t dt_qz = vsubq_f32(qz, prev_qz);
float32x4_t dt_qw = vsubq_f32(qw, prev_qw);
// Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
float32x4_t conj_qx = vnegq_f32(prev_qx);
float32x4_t conj_qy = vnegq_f32(prev_qy);
float32x4_t conj_qz = vnegq_f32(prev_qz);
float32x4_t conj_qw = prev_qw;
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
// Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
// Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
float32x4_t orient_qw = vmlsq_f32(vmulq_f32(dt_qw, conj_qw), dt_qx, conj_qx);
orient_qw = vmlsq_f32(orient_qw, dt_qy, conj_qy);
orient_qw = vmlsq_f32(orient_qw, dt_qz, conj_qz);
float32x4_t cross_x = vmlsq_f32(vmulq_f32(dt_qy, conj_qz), dt_qz, conj_qy);
float32x4_t cross_y = vmlsq_f32(vmulq_f32(dt_qz, conj_qx), dt_qx, conj_qz);
float32x4_t cross_z = vmlsq_f32(vmulq_f32(dt_qx, conj_qy), dt_qy, conj_qx);
float32x4_t orient_qx = vmlaq_f32(vmulq_f32(dt_qw, conj_qx), dt_qx, conj_qw);
orient_qx = vaddq_f32(orient_qx, cross_x);
float32x4_t orient_qy = vmlaq_f32(vmulq_f32(dt_qw, conj_qy), dt_qy, conj_qw);
orient_qy = vaddq_f32(orient_qy, cross_y);
float32x4_t orient_qz = vmlaq_f32(vmulq_f32(dt_qw, conj_qz), dt_qz, conj_qw);
orient_qz = vaddq_f32(orient_qz, cross_z);
// Step 4: Compute dot product (to, from) for shortest arc check
float32x4_t dot = vmlaq_f32(vmulq_f32(qw, prev_qw), qx, prev_qx);
dot = vmlaq_f32(dot, qy, prev_qy);
dot = vmlaq_f32(dot, qz, prev_qz);
// Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
uint32x4_t negate_mask = vcltq_f32(dot, zero_vec); // all-ones lanes where dot < 0
float32x4_t sign_selector = vbslq_f32(negate_mask, neg_one_vec, one_vec); // -1.0f if true, 1.0f if false
orient_qx = vmulq_f32(orient_qx, sign_selector);
orient_qy = vmulq_f32(orient_qy, sign_selector);
orient_qz = vmulq_f32(orient_qz, sign_selector);
// Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
// Reciprocal of (2*dt): vrecpeq_f32 estimate refined by two Newton-Raphson steps
float32x4_t rcp_two_dt;
rcp_two_dt = vrecpeq_f32(vmulq_f32(two_vec, dt_vec));
rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));
rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));
float32x4_t ang_vel_x_vec = vmulq_f32(orient_qx, rcp_two_dt);
float32x4_t ang_vel_y_vec = vmulq_f32(orient_qy, rcp_two_dt);
float32x4_t ang_vel_z_vec = vmulq_f32(orient_qz, rcp_two_dt);
// Store results
vst1q_f32(&ang_vel_x[c], ang_vel_x_vec);
vst1q_f32(&ang_vel_y[c], ang_vel_y_vec);
vst1q_f32(&ang_vel_z[c], ang_vel_z_vec);
}
}
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
for (int c = 0; c < MAX_CHNS; ++c) {
int base_idx = c * PTC_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
// Load precomputed velocities and inertia parameters
float32x4_t vel_x_vec = vdupq_n_f32(vel_x[c]);
float32x4_t vel_y_vec = vdupq_n_f32(vel_y[c]);
float32x4_t vel_z_vec = vdupq_n_f32(vel_z[c]);
float32x4_t ang_vel_x_vec = vdupq_n_f32(ang_vel_x[c]);
float32x4_t ang_vel_y_vec = vdupq_n_f32(ang_vel_y[c]);
float32x4_t ang_vel_z_vec = vdupq_n_f32(ang_vel_z[c]);
float32x4_t linear_inertia_vec = vdupq_n_f32(cfg->linear_inertia);
float32x4_t angular_inertia_vec = vdupq_n_f32(cfg->angular_inertia);
float32x4_t centrifugal_inertia_vec = vdupq_n_f32(cfg->centrifugal_inertia);
// Clamp inertia parameters to [0, 1]
linear_inertia_vec = vmaxq_f32(vminq_f32(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
angular_inertia_vec = vmaxq_f32(vminq_f32(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
centrifugal_inertia_vec = vmaxq_f32(vminq_f32(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
for (int i = 0; i < PTC_PER_CHN; i += 4) {
float32x4_t px = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
float32x4_t py = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
float32x4_t p_ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
p_ptc_inv_mass = vmaxq_f32(vminq_f32(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
// Linear inertia: v * dt * linear_inertia
float32x4_t lin_dt_x = vmulq_f32(vmulq_f32(vel_x_vec, dt_vec), linear_inertia_vec);
float32x4_t lin_dt_y = vmulq_f32(vmulq_f32(vel_y_vec, dt_vec), linear_inertia_vec);
float32x4_t lin_dt_z = vmulq_f32(vmulq_f32(vel_z_vec, dt_vec), linear_inertia_vec);
// Angular inertia: (ω × r) * dt * angular_inertia
float32x4_t ang_dt_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
float32x4_t ang_dt_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
float32x4_t ang_dt_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);
ang_dt_x = vmulq_f32(vmulq_f32(ang_dt_x, dt_vec), angular_inertia_vec);
ang_dt_y = vmulq_f32(vmulq_f32(ang_dt_y, dt_vec), angular_inertia_vec);
ang_dt_z = vmulq_f32(vmulq_f32(ang_dt_z, dt_vec), angular_inertia_vec);
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
float32x4_t cross1_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
float32x4_t cross1_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
float32x4_t cross1_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);
float32x4_t cross2_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, cross1_z), ang_vel_z_vec, cross1_y);
float32x4_t cross2_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, cross1_x), ang_vel_x_vec, cross1_z);
float32x4_t cross2_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, cross1_y), ang_vel_y_vec, cross1_x);
// Calculate displacement: (ω × (ω × r)) * dt^2
float32x4_t base_cent_dt_x = vmulq_f32(cross2_x, dt2_vec);
float32x4_t base_cent_dt_y = vmulq_f32(cross2_y, dt2_vec);
float32x4_t base_cent_dt_z = vmulq_f32(cross2_z, dt2_vec);
// Apply the centrifugal_inertia factor
float32x4_t cent_dt_x = vmulq_f32(base_cent_dt_x, centrifugal_inertia_vec);
float32x4_t cent_dt_y = vmulq_f32(base_cent_dt_y, centrifugal_inertia_vec);
float32x4_t cent_dt_z = vmulq_f32(base_cent_dt_z, centrifugal_inertia_vec);
// Total delta
float32x4_t dt_x = vaddq_f32(vaddq_f32(lin_dt_x, ang_dt_x), cent_dt_x);
float32x4_t dt_y = vaddq_f32(vaddq_f32(lin_dt_y, ang_dt_y), cent_dt_y);
float32x4_t dt_z = vaddq_f32(vaddq_f32(lin_dt_z, ang_dt_z), cent_dt_z);
// Update positions
vst1q_f32(&cns->ptc_pos_x[base_idx + i], vaddq_f32(px, dt_x));
vst1q_f32(&cns->ptc_pos_y[base_idx + i], vaddq_f32(py, dt_y));
vst1q_f32(&cns->ptc_pos_z[base_idx + i], vaddq_f32(pz, dt_z));
}
// Update previous transformation
cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];
cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
}
// Step 2: Verlet Integration
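// Position Verlet with damping: x_new = x + damping*(x - x_prev) + a*dt^2,
// where a = (gravity + wind) * inv_mass in chain-local space; x_prev is then
// set to the pre-update position.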
for (int c = 0; c < MAX_CHNS; ++c) {
int base_idx = c * PTC_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
float32x4_t chn_grav_x_vec = vdupq_n_f32(chn_grav_lcl_x[c]);
float32x4_t chn_grav_y_vec = vdupq_n_f32(chn_grav_lcl_y[c]);
float32x4_t chn_grav_z_vec = vdupq_n_f32(chn_grav_lcl_z[c]);
float32x4_t chn_wnd_x_vec = vdupq_n_f32(chn_wnd_lcl_x[c]);
float32x4_t chn_wnd_y_vec = vdupq_n_f32(chn_wnd_lcl_y[c]);
float32x4_t chn_wnd_z_vec = vdupq_n_f32(chn_wnd_lcl_z[c]);
float32x4_t damping_vec = vdupq_n_f32(cfg->damping);
for (int i = 0; i < PTC_PER_CHN; i += 4) {
float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[base_idx + i]);
float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[base_idx + i]);
float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[base_idx + i]);
float32x4_t force_x = vaddq_f32(chn_grav_x_vec, chn_wnd_x_vec);
float32x4_t force_y = vaddq_f32(chn_grav_y_vec, chn_wnd_y_vec);
float32x4_t force_z = vaddq_f32(chn_grav_z_vec, chn_wnd_z_vec);
float32x4_t acc_x = vmulq_f32(force_x, ptc_inv_mass);
float32x4_t acc_y = vmulq_f32(force_y, ptc_inv_mass);
float32x4_t acc_z = vmulq_f32(force_z, ptc_inv_mass);
float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);
float32x4_t damped_vel_x = vmulq_f32(vel_x, damping_vec);
float32x4_t damped_vel_y = vmulq_f32(vel_y, damping_vec);
float32x4_t damped_vel_z = vmulq_f32(vel_z, damping_vec);
float32x4_t term_accel_x = vmulq_f32(acc_x, dt2_vec);
float32x4_t term_accel_y = vmulq_f32(acc_y, dt2_vec);
float32x4_t term_accel_z = vmulq_f32(acc_z, dt2_vec);
float32x4_t new_p_x = vaddq_f32(p_curr_x, vaddq_f32(damped_vel_x, term_accel_x));
float32x4_t new_p_y = vaddq_f32(p_curr_y, vaddq_f32(damped_vel_y, term_accel_y));
float32x4_t new_p_z = vaddq_f32(p_curr_z, vaddq_f32(damped_vel_z, term_accel_z));
vst1q_f32(&cns->ptc_pos_x[base_idx + i], new_p_x);
vst1q_f32(&cns->ptc_pos_y[base_idx + i], new_p_y);
vst1q_f32(&cns->ptc_pos_z[base_idx + i], new_p_z);
vst1q_f32(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
vst1q_f32(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
vst1q_f32(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
}
}
// Step 3: Distance Constraints
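// Iterative distance-constraint relaxation: per segment the correction is
// diff * stiffness/(1 + |strain|) * stretch_factor / (w0 + w1), with
// strain = (dist - rest)/rest clamped to +/-max_strain, applied along the
// segment axis and split by inverse mass. Particles are staged into local
// arrays with one dummy tail slot so the (i+1)-shifted NEON loads stay in
// bounds.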
for (int c = 0; c < MAX_CHNS; ++c) {
int p_base = c * PTC_PER_CHN;
int r_base = c * CON_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
float32x4_t stiff_vec = vdupq_n_f32(cfg->stiffness);
float32x4_t stretch_factor_vec = vdupq_n_f32(cfg->stretch_factor);
float32x4_t max_strain_vec_abs = vdupq_n_f32(cfg->max_strain);
float32x4_t neg_max_strain_vec_abs = vmulq_f32(max_strain_vec_abs, neg_one_vec);
// Loop by 4 for NEON
for (int i = 0; i < PTC_PER_CHN; i += 4) {
float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
vst1q_f32(&px_lcl[i], vld1q_f32(&cns->ptc_pos_x[p_base + i]));
vst1q_f32(&py_lcl[i], vld1q_f32(&cns->ptc_pos_y[p_base + i]));
vst1q_f32(&pz_lcl[i], vld1q_f32(&cns->ptc_pos_z[p_base + i]));
vst1q_f32(&pm_lcl[i], pm);
}
for (int iter = 0; iter < MAX_ITR; ++iter) {
// Loop by 4 for NEON
for (int i = 0; i < PTC_PER_CHN; i += 4) {
float32x4_t p0x = vld1q_f32(&px_lcl[i]);
float32x4_t p0y = vld1q_f32(&py_lcl[i]);
float32x4_t p0z = vld1q_f32(&pz_lcl[i]);
float32x4_t p0m = vld1q_f32(&pm_lcl[i]);
float32x4_t p1x = vld1q_f32(&px_lcl[i + 1]);
float32x4_t p1y = vld1q_f32(&py_lcl[i + 1]);
float32x4_t p1z = vld1q_f32(&pz_lcl[i + 1]);
float32x4_t p1m = vld1q_f32(&pm_lcl[i + 1]);
float32x4_t rest_len_vec = vld1q_f32(&cns->ptc_rest_len[r_base + i]);
float32x4_t dx = vsubq_f32(p1x, p0x);
float32x4_t dy = vsubq_f32(p1y, p0y);
float32x4_t dz = vsubq_f32(p1z, p0z);
float32x4_t dist_sq = vmulq_f32(dx, dx);
dist_sq = vmlaq_f32(dist_sq, dy, dy);
dist_sq = vmlaq_f32(dist_sq, dz, dz);
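// 1/sqrt(dist_sq) via vrsqrteq_f32 estimate refined with two Newton-Raphson
// steps (vrsqrtsq_f32); the reciprocals below use the matching
// vrecpeq_f32/vrecpsq_f32 estimate-and-refine pattern.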
float32x4_t inv_dist;
float32x4_t x_rsqrt = vmaxq_f32(dist_sq, eps_vec);
inv_dist = vrsqrteq_f32(x_rsqrt);
inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));
inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));
float32x4_t w_sum = vaddq_f32(p0m, p1m);
float32x4_t w_sum_clamped = vmaxq_f32(w_sum, eps_vec);
float32x4_t rcp_w_sum;
rcp_w_sum = vrecpeq_f32(w_sum_clamped);
rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));
rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));
float32x4_t dist;
dist = vrecpeq_f32(inv_dist);
dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));
dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));
float32x4_t diff = vsubq_f32(dist, rest_len_vec);
float32x4_t max_rest_len_eps = vmaxq_f32(rest_len_vec, eps_vec);
float32x4_t rcp_max_rest_len_eps;
rcp_max_rest_len_eps = vrecpeq_f32(max_rest_len_eps);
rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
float32x4_t strain = vmulq_f32(diff, rcp_max_rest_len_eps);
float32x4_t strain_clamped = vmaxq_f32(neg_max_strain_vec_abs, vminq_f32(strain, max_strain_vec_abs));
float32x4_t dyn_stiff_denom = vaddq_f32(one_vec, vabsq_f32(strain_clamped));
float32x4_t rcp_dyn_stiff_denom;
rcp_dyn_stiff_denom = vrecpeq_f32(dyn_stiff_denom);
rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
float32x4_t dyn_stiff = vmulq_f32(stiff_vec, rcp_dyn_stiff_denom);
float32x4_t corr_scl_part = vmulq_f32(dyn_stiff, stretch_factor_vec);
corr_scl_part = vmulq_f32(corr_scl_part, rcp_w_sum);
float32x4_t corr_magnitude = vmulq_f32(diff, corr_scl_part);
uint32x4_t apply_corr_mask = vcgtq_f32(dist_sq, eps_vec); // all-ones lanes where dist_sq > eps
float32x4_t dt_unit_x = vmulq_f32(dx, inv_dist);
float32x4_t dt_unit_y = vmulq_f32(dy, inv_dist);
float32x4_t dt_unit_z = vmulq_f32(dz, inv_dist);
float32x4_t dt_x = vmulq_f32(dt_unit_x, corr_magnitude);
float32x4_t dt_y = vmulq_f32(dt_unit_y, corr_magnitude);
float32x4_t dt_z = vmulq_f32(dt_unit_z, corr_magnitude);
dt_x = vbslq_f32(apply_corr_mask, dt_x, zero_vec); // Apply mask: if mask is 0, set to 0
dt_y = vbslq_f32(apply_corr_mask, dt_y, zero_vec);
dt_z = vbslq_f32(apply_corr_mask, dt_z, zero_vec);
vst1q_f32(&px_lcl[i], vmlaq_f32(p0x, dt_x, p0m));
vst1q_f32(&py_lcl[i], vmlaq_f32(p0y, dt_y, p0m));
vst1q_f32(&pz_lcl[i], vmlaq_f32(p0z, dt_z, p0m));
vst1q_f32(&px_lcl[i + 1], vmlsq_f32(p1x, dt_x, p1m));
vst1q_f32(&py_lcl[i + 1], vmlsq_f32(p1y, dt_y, p1m));
vst1q_f32(&pz_lcl[i + 1], vmlsq_f32(p1z, dt_z, p1m));
}
}
for (int i = 0; i < PTC_PER_CHN; i += 4) {
vst1q_f32(&cns->ptc_pos_x[p_base + i], vld1q_f32(&px_lcl[i]));
vst1q_f32(&cns->ptc_pos_y[p_base + i], vld1q_f32(&py_lcl[i]));
vst1q_f32(&cns->ptc_pos_z[p_base + i], vld1q_f32(&pz_lcl[i]));
}
}
// Step 4: Sphere Collisions with Friction
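// Particles penetrating a sphere (pen = radius - dist > 0) are projected out
// along the contact normal; a tangential correction of magnitude
// pen * friction opposes the sliding component of the Verlet velocity.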
for (int c = 0; c < MAX_CHNS; ++c) {
int p_base = c * PTC_PER_CHN;
int s_base = c * SPH_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
float32x4_t fric_vec = vdupq_n_f32(cfg->friction);
for (int s = 0; s < SPH_PER_CHN; ++s) {
float32x4_t sph_x_vec = vdupq_n_f32(cns->sph_x[s_base + s]);
float32x4_t sph_y_vec = vdupq_n_f32(cns->sph_y[s_base + s]);
float32x4_t sph_z_vec = vdupq_n_f32(cns->sph_z[s_base + s]);
float32x4_t sph_r_vec = vdupq_n_f32(cns->sph_r[s_base + s]);
for (int i = 0; i < PTC_PER_CHN; i += 4) {
float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[p_base + i]);
float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[p_base + i]);
float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[p_base + i]);
float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[p_base + i]);
float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
float32x4_t dx = vsubq_f32(p_curr_x, sph_x_vec);
float32x4_t dy = vsubq_f32(p_curr_y, sph_y_vec);
float32x4_t dz = vsubq_f32(p_curr_z, sph_z_vec);
float32x4_t dist_sq = vmulq_f32(dx, dx);
dist_sq = vmlaq_f32(dist_sq, dy, dy);
dist_sq = vmlaq_f32(dist_sq, dz, dz);
float32x4_t inv_dist_sphere;
float32x4_t x_rsqrt_sphere = vmaxq_f32(dist_sq, eps_vec);
inv_dist_sphere = vrsqrteq_f32(x_rsqrt_sphere);
inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
float32x4_t inv_dist = inv_dist_sphere;
float32x4_t dist_sphere;
dist_sphere = vrecpeq_f32(inv_dist);
dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
float32x4_t dist = dist_sphere;
float32x4_t pen = vsubq_f32(sph_r_vec, dist);
uint32x4_t col_mask = vcgtq_f32(pen, zero_vec); // all-ones lanes where pen > 0 (penetrating)
float32x4_t norm_x = vmulq_f32(dx, inv_dist);
float32x4_t norm_y = vmulq_f32(dy, inv_dist);
float32x4_t norm_z = vmulq_f32(dz, inv_dist);
float32x4_t norm_corr_x = vmulq_f32(norm_x, pen);
float32x4_t norm_corr_y = vmulq_f32(norm_y, pen);
float32x4_t norm_corr_z = vmulq_f32(norm_z, pen);
float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);
float32x4_t vel_dot_norm = vmulq_f32(vel_x, norm_x);
vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_y, norm_y);
vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_z, norm_z);
float32x4_t tan_x = vmlsq_f32(vel_x, vel_dot_norm, norm_x);
float32x4_t tan_y = vmlsq_f32(vel_y, vel_dot_norm, norm_y);
float32x4_t tan_z = vmlsq_f32(vel_z, vel_dot_norm, norm_z);
float32x4_t tan_mag_sq = vmulq_f32(tan_x, tan_x);
tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_y, tan_y);
tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_z, tan_z);
float32x4_t inv_tan_mag_sphere;
float32x4_t x_rsqrt_tan_mag = vmaxq_f32(tan_mag_sq, eps_vec);
inv_tan_mag_sphere = vrsqrteq_f32(x_rsqrt_tan_mag);
inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
float32x4_t inv_tan_mag = inv_tan_mag_sphere;
float32x4_t fric_mag = vmulq_f32(pen, fric_vec);
float32x4_t fric_x = vmulq_f32(vmulq_f32(tan_x, inv_tan_mag), fric_mag);
float32x4_t fric_y = vmulq_f32(vmulq_f32(tan_y, inv_tan_mag), fric_mag);
float32x4_t fric_z = vmulq_f32(vmulq_f32(tan_z, inv_tan_mag), fric_mag);
float32x4_t total_corr_x = vsubq_f32(norm_corr_x, fric_x);
float32x4_t total_corr_y = vsubq_f32(norm_corr_y, fric_y);
float32x4_t total_corr_z = vsubq_f32(norm_corr_z, fric_z);
total_corr_x = vbslq_f32(col_mask, total_corr_x, zero_vec);
total_corr_y = vbslq_f32(col_mask, total_corr_y, zero_vec);
total_corr_z = vbslq_f32(col_mask, total_corr_z, zero_vec);
total_corr_x = vmulq_f32(total_corr_x, ptc_inv_mass);
total_corr_y = vmulq_f32(total_corr_y, ptc_inv_mass);
total_corr_z = vmulq_f32(total_corr_z, ptc_inv_mass);
float32x4_t new_p_x = vaddq_f32(p_curr_x, total_corr_x);
float32x4_t new_p_y = vaddq_f32(p_curr_y, total_corr_y);
float32x4_t new_p_z = vaddq_f32(p_curr_z, total_corr_z);
vst1q_f32(&cns->ptc_pos_x[p_base + i], new_p_x);
vst1q_f32(&cns->ptc_pos_y[p_base + i], new_p_y);
vst1q_f32(&cns->ptc_pos_z[p_base + i], new_p_z);
}
}
}
// Step 5: Motion Constraints
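// Clamp each particle to a sphere of motion_radius around its rest position;
// tail entries default to FLT_MAX (unconstrained), see chn_sim_con_motion.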
for (int c = 0; c < MAX_CHNS; ++c) {
int p_base = c * PTC_PER_CHN;
// Loop by 4 for NEON
for (int i = 0; i < PTC_PER_CHN; i += 4) {
float32x4_t px = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
float32x4_t py = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[p_base + i]);
float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
float32x4_t rx = vld1q_f32(&cns->rest_pos_x[p_base + i]);
float32x4_t ry = vld1q_f32(&cns->rest_pos_y[p_base + i]);
float32x4_t rz = vld1q_f32(&cns->rest_pos_z[p_base + i]);
float32x4_t r_vec = vld1q_f32(&cns->motion_radius[p_base + i]);
float32x4_t dx = vsubq_f32(px, rx);
float32x4_t dy = vsubq_f32(py, ry);
float32x4_t dz = vsubq_f32(pz, rz);
float32x4_t dist_sq = vmulq_f32(dx, dx);
dist_sq = vmlaq_f32(dist_sq, dy, dy);
dist_sq = vmlaq_f32(dist_sq, dz, dz);
float32x4_t inv_dist_motion;
float32x4_t x_rsqrt_motion = vmaxq_f32(dist_sq, eps_vec);
inv_dist_motion = vrsqrteq_f32(x_rsqrt_motion);
inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
float32x4_t inv_dist = inv_dist_motion;
float32x4_t dist_motion;
dist_motion = vrecpeq_f32(inv_dist);
dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
float32x4_t dist = dist_motion;
float32x4_t pen = vsubq_f32(dist, r_vec);
uint32x4_t mask = vcgtq_f32(pen, zero_vec); // all-ones lanes where the particle is outside the radius
float32x4_t corr_factor = vmulq_f32(pen, inv_dist);
corr_factor = vbslq_f32(mask, corr_factor, zero_vec);
float32x4_t dt_x = vmulq_f32(dx, corr_factor);
float32x4_t dt_y = vmulq_f32(dy, corr_factor);
float32x4_t dt_z = vmulq_f32(dz, corr_factor);
dt_x = vmulq_f32(dt_x, pm);
dt_y = vmulq_f32(dt_y, pm);
dt_z = vmulq_f32(dt_z, pm);
vst1q_f32(&cns->ptc_pos_x[p_base + i], vsubq_f32(px, dt_x));
vst1q_f32(&cns->ptc_pos_y[p_base + i], vsubq_f32(py, dt_y));
vst1q_f32(&cns->ptc_pos_z[p_base + i], vsubq_f32(pz, dt_z));
}
}
}
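/* ===========================================================================
 * Second file: AVX2 (x86) variant with AVBD-style adaptive constraint
 * stiffness. Same overall pipeline as the NEON version above, eight lanes
 * per iteration.
 * =========================================================================== */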
#include <immintrin.h>
#include <math.h>
enum {
MAX_CHAINS = 64,
PARTICLES_PER_CHAIN = 16,
SPHERES_PER_CHAIN = 8,
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
MAX_ITERATIONS = 4,
};
struct chain_cfg {
float damping;
float friction;
float collision_radius;
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
float avbd_alpha; // Regularization parameter (e.g., 0.95f)
float avbd_beta; // Stiffness ramping control (e.g., 10.0f)
float avbd_gamma; // Warm starting scaling (e.g., 0.99f)
float avbd_k_start; // Initial stiffness for AVBD (e.g., 100.0f)
float avbd_k_max; // Maximum stiffness for AVBD (e.g., 1e6f)
} __attribute__((aligned(32)));
struct chain_sim {
unsigned short free_idx_cnt;
unsigned short free_idx[MAX_CHAINS];
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));
// Chain global forces (world space gravity and wind)
float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));
float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
float wind_z[MAX_CHAINS] __attribute__((aligned(32)));
// Chain transformations (world space)
float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
// Particle positions (model space)
float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
float dist_lambda[MAX_REST_LENGTHS] __attribute__((aligned(32)));
float dist_k_current[MAX_REST_LENGTHS] __attribute__((aligned(32)));
float dist_prev_frame_error[MAX_REST_LENGTHS] __attribute__((aligned(32)));
// Sphere positions (model space)
float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));
float coll_lambda[MAX_PARTICLES + 1] __attribute__((aligned(32)));
float coll_k_current[MAX_PARTICLES + 1] __attribute__((aligned(32)));
float coll_prev_frame_error[MAX_PARTICLES + 1] __attribute__((aligned(32)));
// Rest positions (model space)
float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
};
void simulate_chains(struct chain_sim *cs, float dt) {
// Common SIMD constants
const __m256 dt_vec = _mm256_set1_ps(dt);
const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
const __m256 one_vec = _mm256_set1_ps(1.0f);
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
const __m256 eps_vec = _mm256_set1_ps(1e-6f);
const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f);
const __m256 zero_vec = _mm256_setzero_ps();
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
const __m256 two_vec = _mm256_set1_ps(2.0f);
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
// Local arrays for PBD solver
float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
// Initialize dummy elements
px_local[PARTICLES_PER_CHAIN] = 0.0f;
py_local[PARTICLES_PER_CHAIN] = 0.0f;
pz_local[PARTICLES_PER_CHAIN] = 0.0f;
pm_local[PARTICLES_PER_CHAIN] = 0.0f;
// Stack arrays for precomputed data
float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));
float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
float vel_z[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));
// Step 0: Precomputation (SIMD, 8 chains at once)
for (int c = 0; c < MAX_CHAINS; c += 8) {
// Load chain quaternions
__m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
__m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
__m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
__m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
// Compute local-space wind
{
__m256 vx = _mm256_load_ps(&cs->wind_x[c]);
__m256 vy = _mm256_load_ps(&cs->wind_y[c]);
__m256 vz = _mm256_load_ps(&cs->wind_z[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: result_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// result_x = tw*qx + tx*qw + ty*qz - tz*qy
// result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
_mm256_store_ps(&wind_local_x[c], result_x);
_mm256_store_ps(&wind_local_y[c], result_y);
_mm256_store_ps(&wind_local_z[c], result_z);
}
// Compute local-space gravity
{
__m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
__m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
__m256 vz = _mm256_load_ps(&cs->gravity_z[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: result_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// result_x = tw*qx + tx*qw + ty*qz - tz*qy
// result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
_mm256_store_ps(&gravity_local_x[c], result_x);
_mm256_store_ps(&gravity_local_y[c], result_y);
_mm256_store_ps(&gravity_local_z[c], result_z);
}
// Compute linear velocity
{
__m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
__m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
__m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);
__m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
__m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
__m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
_mm256_store_ps(&vel_x[c], vel_x_vec);
_mm256_store_ps(&vel_y[c], vel_y_vec);
_mm256_store_ps(&vel_z[c], vel_z_vec);
}
// Compute angular velocity
{
__m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
__m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
__m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
__m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
__m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
__m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
__m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
__m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);
// Step 1: Compute delta quaternion (to - from)
__m256 delta_qx = _mm256_sub_ps(qx, prev_qx);
__m256 delta_qy = _mm256_sub_ps(qy, prev_qy);
__m256 delta_qz = _mm256_sub_ps(qz, prev_qz);
__m256 delta_qw = _mm256_sub_ps(qw, prev_qw);
// Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
__m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
__m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
__m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
__m256 conj_qw = prev_qw;
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
// Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
// Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
__m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw));
orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw);
orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw);
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy));
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz));
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx));
__m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw));
orient_qx = _mm256_add_ps(orient_qx, cross_x);
__m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw));
orient_qy = _mm256_add_ps(orient_qy, cross_y);
__m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw));
orient_qz = _mm256_add_ps(orient_qz, cross_z);
// Step 4: Compute dot product (to, from) for shortest arc check
__m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
dot = _mm256_fmadd_ps(qy, prev_qy, dot);
dot = _mm256_fmadd_ps(qz, prev_qz, dot);
// Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
__m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
__m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
_mm256_andnot_ps(negate_mask, one_vec));
orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
orient_qz = _mm256_mul_ps(orient_qz, negate_sign);
// Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
__m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
__m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
__m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
__m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);
// Store results
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
}
}
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
for (int c = 0; c < MAX_CHAINS; c++) {
int base_idx = c * PARTICLES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
// Load precomputed velocities and inertia parameters
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
// Clamp inertia parameters to [0, 1]
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
__m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
__m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
// Linear inertia: v * dt * linear_inertia
__m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
__m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
__m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
// Angular inertia: (ω × r) * dt * angular_inertia
__m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
// Calculate displacement: (ω × (ω × r)) * dt^2
__m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);
// Apply the centrifugal_inertia factor
__m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
__m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec);
__m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec);
// Total delta
__m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
__m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y);
__m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z);
// Update positions
_mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x));
_mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y));
_mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z));
}
// Update previous transformation
cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];
cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
}
// Step 2: Verlet Integration
for (int c = 0; c < MAX_CHAINS; c++) {
int base_idx = c * PARTICLES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
__m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
__m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);
__m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
__m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
__m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
__m256 damping_vec = _mm256_set1_ps(cfg->damping);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]);
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]);
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]);
__m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec);
__m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec);
__m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec);
__m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x);
_mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y);
_mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z);
_mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x);
_mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y);
_mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z);
}
}
// AVBD warm-starting of lambda and k_current for all distance constraints (Eq. 19)
for (int c = 0; c < MAX_CHAINS; ++c) {
struct chain_cfg* cfg = &cs->chain_configs[c];
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);
// Loop through each block of 8 constraints for warm-starting
int r_base = c * CONSTRAINTS_PER_CHAIN; // Base index for this chain's constraints
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += 8) {
// Load current lambda, k_current for 8 constraints simultaneously
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + i]);
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + i]);
// Apply warm-starting (Eq. 19 from paper)
// lambda_new = lambda_prev_frame * alpha * gamma
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
// k_current_new = max(K_START, k_current_prev_frame * gamma)
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));
// Store updated warm-started values back
_mm256_store_ps(&cs->dist_lambda[r_base + i], current_lambda_vec);
_mm256_store_ps(&cs->dist_k_current[r_base + i], current_k_vec);
}
}
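// For reference, the scalar equivalent of the warm-start applied above to a
// single constraint j (a sketch, assuming the same cfg fields):
//   dist_lambda[j]    = dist_lambda[j] * avbd_alpha * avbd_gamma;          // Eq. 19
//   dist_k_current[j] = fmaxf(avbd_k_start, dist_k_current[j] * avbd_gamma);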
// AVBD warm-starting for collision constraints
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
struct chain_cfg* cfg = &cs->chain_configs[c_idx];
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);
int p_base = c_idx * PARTICLES_PER_CHAIN; // Base index for this chain's particles
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) {
__m256 current_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
__m256 current_k_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));
_mm256_store_ps(&cs->coll_lambda[p_base + i], current_lambda_vec);
_mm256_store_ps(&cs->coll_k_current[p_base + i], current_k_vec);
}
}
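// Step 3: Distance Constraints (AVBD primal and dual updates)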
for (int c = 0; c < MAX_CHAINS; c++) {
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 alpha_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 beta_vec = _mm256_set1_ps(cfg->avbd_beta);
__m256 k_max_vec = _mm256_set1_ps(cfg->avbd_k_max);
int p_base = c * PARTICLES_PER_CHAIN;
int r_base = c * CONSTRAINTS_PER_CHAIN;
// Load all current particle positions into local buffer (once per chain per iteration)
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
_mm256_store_ps(&px_local[i], _mm256_load_ps(&cs->p_pos_x[p_base + i]));
_mm256_store_ps(&py_local[i], _mm256_load_ps(&cs->p_pos_y[p_base + i]));
_mm256_store_ps(&pz_local[i], _mm256_load_ps(&cs->p_pos_z[p_base + i]));
_mm256_store_ps(&pm_local[i], _mm256_load_ps(&cs->inv_mass[p_base + i]));
_mm256_store_ps(&pm_local[i], _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&pm_local[i]), inv_mass_clamp_max), inv_mass_clamp_min));
}
for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
// First block (i=0 to 7 of particles, processing constraints 0 to 7)
{
// p0 is aligned from px_local[0]
__m256 p0x = _mm256_load_ps(&px_local[0]);
__m256 p0y = _mm256_load_ps(&py_local[0]);
__m256 p0z = _mm256_load_ps(&pz_local[0]);
__m256 p0m = _mm256_load_ps(&pm_local[0]);
// p1 is unaligned from px_local[1]
__m256 p1x = _mm256_loadu_ps(&px_local[1]);
__m256 p1y = _mm256_loadu_ps(&py_local[1]);
__m256 p1z = _mm256_loadu_ps(&pz_local[1]);
__m256 p1m = _mm256_loadu_ps(&pm_local[1]);
// This corresponds to rest_lengths[r_base + 0] through rest_lengths[r_base + 7]
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]);
// Calculate current vector and distance between particles
__m256 dx_vec = _mm256_sub_ps(p1x, p0x);
__m256 dy_vec = _mm256_sub_ps(p1y, p0y);
__m256 dz_vec = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); // 1/dist
__m256 dist = _mm256_rcp_ps(inv_dist); // Current distance
// Primal Update for Distance Constraints (Block 1) ---
// Load AVBD state for this block of 8 constraints (indices r_base + 0 to r_base + 7)
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 0]);
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 0]);
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 0]);
// Original constraint error C_j*(x) = current_dist - target_length
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));
// Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda)
// The effective displacement is proportional to this force divided by effective stiffness (k) and mass.
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);
// Calculate the scalar part of the correction, similar to original 'corr_scalar_part'
// This is simplified, representing the desired displacement due to the augmented force
__m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor
// Normalize difference vector
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);
// Calculate the actual displacement vectors
// Note the sign inversion from original: -(delta_x * pm0) for p0 and +(delta_x * pm1) for p1
// The 'effective_corr_scalar_part' already implicitly carries the sign of the desired correction.
// So, we just multiply by the unit direction and inverse mass.
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);
// Apply correction mask (from original code, avoid division by zero artifacts where dist is ~0)
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
// Update positions in local buffer
// p0 moves along -delta (subtract), p1 moves along +delta (add)
_mm256_store_ps(&px_local[0], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
_mm256_store_ps(&py_local[0], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
_mm256_store_ps(&pz_local[0], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m)));
_mm256_storeu_ps(&px_local[1], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
_mm256_storeu_ps(&py_local[1], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
_mm256_storeu_ps(&pz_local[1], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m)));
// Dual Update for Distance Constraints (Block 1) ---
// Recalculate constraint error *after* position updates for accurate lambda update
// Reload updated positions from local buffer to get the 'post-primal' state
p0x = _mm256_load_ps(&px_local[0]);
p0y = _mm256_load_ps(&py_local[0]);
p0z = _mm256_load_ps(&pz_local[0]);
p1x = _mm256_loadu_ps(&px_local[1]);
p1y = _mm256_loadu_ps(&py_local[1]);
p1z = _mm256_loadu_ps(&pz_local[1]);
__m256 updated_dx = _mm256_sub_ps(p1x, p0x);
__m256 updated_dy = _mm256_sub_ps(p1y, p0y);
__m256 updated_dz = _mm256_sub_ps(p1z, p0z);
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
_mm256_store_ps(&cs->dist_lambda[r_base + 0], new_lambda_vec); // Store back
// Update k_current (Stiffness ramping, Eq. 12)
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
_mm256_store_ps(&cs->dist_k_current[r_base + 0], new_k_vec); // Store back
}
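// Scalar sketch of the AVBD step both constraint blocks perform, for one
// constraint j between particles a and b (illustrative names, not real code):
//   C     = dist(a, b) - rest_len[j];                          // constraint error
//   C_reg = C - avbd_alpha * prev_frame_error[j];              // Eq. 18
//   s     = (k[j] * C_reg + lambda[j]) / k[j];                 // displacement factor
//   a.pos -= s * w_a * n;  b.pos += s * w_b * n;               // n = (b - a) / dist
//   C_new = dist(a, b) - rest_len[j];                          // post-update error
//   lambda[j] = lambda[j] + k[j] * C_new;                      // Eq. 11
//   k[j] = fminf(avbd_k_max, k[j] + avbd_beta * fabsf(C_new)); // Eq. 12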
// Second block (particles 8 to 15, processing constraints 8 to 15; the last lane pairs with the dummy and is masked out below)
{
// p0 is aligned from px_local[8]
__m256 p0x = _mm256_load_ps(&px_local[8]);
__m256 p0y = _mm256_load_ps(&py_local[8]);
__m256 p0z = _mm256_load_ps(&pz_local[8]);
__m256 p0m = _mm256_load_ps(&pm_local[8]);
// p1 is unaligned from px_local[9], *including* the dummy element at px_local[16]
// in the last lane (particle 15 pairs with the dummy particle 16)
__m256 p1x = _mm256_loadu_ps(&px_local[9]);
__m256 p1y = _mm256_loadu_ps(&py_local[9]);
__m256 p1z = _mm256_loadu_ps(&pz_local[9]);
__m256 p1m = _mm256_loadu_ps(&pm_local[9]);
// This corresponds to rest_lengths[r_base + 8] through rest_lengths[r_base + 15]
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]);
__m256 dx_vec = _mm256_sub_ps(p1x, p0x);
__m256 dy_vec = _mm256_sub_ps(p1y, p0y);
__m256 dz_vec = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
// Primal Update for Distance Constraints (Block 2) ---
// Load AVBD state for this block of 8 constraints (indices r_base + 8 to r_base + 15)
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 8]);
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 8]);
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 8]);
// Original constraint error C_j*(x) = current_dist - target_length
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));
// Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda)
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);
// Calculate the scalar part of the correction, similar to original 'corr_scalar_part'
__m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
// This 'valid_mask' handles the fact that the last particle doesn't have a *next* particle
// for a distance constraint. It effectively sets the correction for the last element (index 7 of the block) to zero.
// For PARTICLES_PER_CHAIN = 16, this second block covers constraints for particles 8 to 15.
// The constraint for particle 15 (index 7 in this block) is between particle 15 and 16 (the dummy).
// The mask effectively disables the constraint correction for the dummy particle.
__attribute__((aligned(32))) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f };
__m256 valid_mask = _mm256_load_ps(valid_mask_array);
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);
delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
// Update positions in local buffer
_mm256_store_ps(&px_local[8], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
_mm256_store_ps(&py_local[8], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
_mm256_store_ps(&pz_local[8], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m)));
_mm256_storeu_ps(&px_local[9], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
_mm256_storeu_ps(&py_local[9], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
_mm256_storeu_ps(&pz_local[9], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m)));
// Recalculate constraint error *after* position updates for accurate lambda update
// Reload updated positions from local buffer
p0x = _mm256_load_ps(&px_local[8]);
p0y = _mm256_load_ps(&py_local[8]);
p0z = _mm256_load_ps(&pz_local[8]);
p1x = _mm256_loadu_ps(&px_local[9]);
p1y = _mm256_loadu_ps(&py_local[9]);
p1z = _mm256_loadu_ps(&pz_local[9]);
__m256 updated_dx = _mm256_sub_ps(p1x, p0x);
__m256 updated_dy = _mm256_sub_ps(p1y, p0y);
__m256 updated_dz = _mm256_sub_ps(p1z, p0z);
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
_mm256_store_ps(&cs->dist_lambda[r_base + 8], new_lambda_vec); // Store back
// Update k_current (Stiffness ramping, Eq. 12)
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
_mm256_store_ps(&cs->dist_k_current[r_base + 8], new_k_vec); // Store back
} // End of second block
}
// After all iterations, save the final positions from local buffer to global cs->p_pos_x/y/z
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_load_ps(&px_local[i]));
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_load_ps(&py_local[i]));
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_load_ps(&pz_local[i]));
}
}
// Step 4: Sphere Collisions with Friction
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
int s_base = c * SPHERES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 friction_vec = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPHERES_PER_CHAIN; s++) {
__m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]);
__m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]);
__m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]);
__m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]);
__m256 alpha_coll_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 beta_coll_vec = _mm256_set1_ps(cfg->avbd_beta);
__m256 k_max_coll_vec = _mm256_set1_ps(cfg->avbd_k_max);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 penetration = _mm256_sub_ps(sphere_r_vec, dist);
__m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
__m256 normal_x = _mm256_mul_ps(dx, inv_dist);
__m256 normal_y = _mm256_mul_ps(dy, inv_dist);
__m256 normal_z = _mm256_mul_ps(dz, inv_dist);
// Load AVBD state for this particle block
__m256 coll_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
__m256 coll_k_current_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);
__m256 coll_prev_frame_error_vec = _mm256_load_ps(&cs->coll_prev_frame_error[p_base + i]);
// AVBD Primal Update:
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame)
__m256 C_x_regularized = _mm256_sub_ps(penetration, _mm256_mul_ps(alpha_coll_vec, coll_prev_frame_error_vec));
// Apply collision_mask to regularized error, so only active constraints contribute.
// This ensures C_x_regularized is 0 for non-penetrating particles, making force_term 0.
C_x_regularized = _mm256_and_ps(C_x_regularized, collision_mask);
// Calculate force term: (k * C_reg + lambda)
__m256 force_term = _mm256_fmadd_ps(coll_k_current_vec, C_x_regularized, coll_lambda_vec);
// Calculate correction magnitude (delta_p in paper): force_term * inv_mass / k_current
// For collision, delta_x = -nablaC_j(x) * (H_j_inv * (lambda_j + K_j C_j(x))).
// H_j_inv for a single particle collision is its inverse mass (p_inv_mass).
// So, the correction should be along the normal: -normal * (force_term * p_inv_mass) / k_current
// The force_term already has the appropriate sign for correction direction when C_x_regularized is positive (penetration).
__m256 correction_magnitude_scalar = _mm256_div_ps(_mm256_mul_ps(force_term, p_inv_mass), coll_k_current_vec);
// Apply the correction along the normal direction.
// The sign of 'penetration' (and thus 'C_x_regularized' and 'force_term')
// means positive 'correction_magnitude_scalar' should push OUT of collision.
// So we subtract from particle position.
__m256 delta_pos_x = _mm256_mul_ps(normal_x, correction_magnitude_scalar);
__m256 delta_pos_y = _mm256_mul_ps(normal_y, correction_magnitude_scalar);
__m256 delta_pos_z = _mm256_mul_ps(normal_z, correction_magnitude_scalar);
// Update positions: p_curr = p_curr - delta_pos (particle moves out of penetration)
p_curr_x = _mm256_sub_ps(p_curr_x, delta_pos_x);
p_curr_y = _mm256_sub_ps(p_curr_y, delta_pos_y);
p_curr_z = _mm256_sub_ps(p_curr_z, delta_pos_z);
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);
// Apply friction (post-stabilization step)
// Recalculate velocity based on updated positions
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
// Project velocity onto normal to get normal component
__m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x);
vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal);
vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal);
// Get tangential velocity component
__m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x));
__m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y));
__m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z));
__m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x);
tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq);
tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq);
__m256 inv_tangent_mag = _mm256_rsqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec));
// Calculate friction force magnitude limit (based on normal force proxy)
__m256 friction_mag_limit = _mm256_mul_ps(penetration, friction_vec);
// Actual tangential velocity length
__m256 actual_tangent_mag = _mm256_sqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)); // tangential displacement length
// Calculate the scaling factor for friction. If actual_tangent_mag is greater than limit, scale down.
__m256 friction_scale = _mm256_div_ps(friction_mag_limit, _mm256_max_ps(actual_tangent_mag, eps_vec));
friction_scale = _mm256_min_ps(friction_scale, one_vec); // Clamp to 1.0 (no acceleration from friction)
// Apply friction as a reduction of tangential displacement.
// vel_x/y/z are already displacements from p_prev to p_curr, so the
// tangential displacement is tangent_x/y/z; we remove friction_scale of it:
// friction_scale = 1 removes it entirely (static friction), 0 leaves it untouched.
__m256 friction_x_corr = _mm256_mul_ps(tangent_x, friction_scale);
__m256 friction_y_corr = _mm256_mul_ps(tangent_y, friction_scale);
__m256 friction_z_corr = _mm256_mul_ps(tangent_z, friction_scale);
// Only apply if actively colliding
friction_x_corr = _mm256_and_ps(friction_x_corr, collision_mask);
friction_y_corr = _mm256_and_ps(friction_y_corr, collision_mask);
friction_z_corr = _mm256_and_ps(friction_z_corr, collision_mask);
// Apply friction by moving the particle along the tangential direction.
// If friction_scale is 0 (no friction), no change. If 1 (full friction), moves back to p_prev.
p_curr_x = _mm256_sub_ps(p_curr_x, friction_x_corr);
p_curr_y = _mm256_sub_ps(p_curr_y, friction_y_corr);
p_curr_z = _mm256_sub_ps(p_curr_z, friction_z_corr);
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);
// Dual Update for Collision Constraints ---
// Recalculate collision error *after* position updates (normal & friction)
// Reload updated positions
p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);
dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
dist = _mm256_rcp_ps(inv_dist);
__m256 C_x_updated_collision = _mm256_sub_ps(sphere_r_vec, dist); // New error after primal update
// Apply the collision mask again for updates, so only active constraints update their AVBD state
C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, collision_mask);
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
__m256 new_coll_lambda_vec = _mm256_fmadd_ps(coll_k_current_vec, C_x_updated_collision, coll_lambda_vec);
_mm256_store_ps(&cs->coll_lambda[p_base + i], new_coll_lambda_vec);
// Update k_current (Stiffness ramping, Eq. 12)
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
__m256 abs_C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, abs_mask_ps); // Absolute value
__m256 k_updated_coll_vec = _mm256_fmadd_ps(beta_coll_vec, abs_C_x_updated_collision, coll_k_current_vec);
__m256 new_coll_k_vec = _mm256_min_ps(k_max_coll_vec, k_updated_coll_vec);
_mm256_store_ps(&cs->coll_k_current[p_base + i], new_coll_k_vec);
// --- END AVBD ADDITION ---
}
}
}
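// Scalar sketch of the per-particle collision/friction pass above
// (illustrative names; w = clamped inverse mass, n = (p - center) / dist):
//   pen    = radius - dist(p, center);                        // > 0 when inside
//   C_reg  = (pen - avbd_alpha * coll_prev_frame_error[p]) when colliding, else 0;
//   p.pos -= n * (k * C_reg + lambda) * w / k;                // primal push-out
//   t      = tangential part of (p.pos - p.prev);
//   p.pos -= t * fminf(1, pen * friction / |t|);              // clamped friction
//   lambda += k * pen_new;  k = fminf(k_max, k + beta * fabsf(pen_new));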
// Step 5: Motion Constraints
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]);
__m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]);
__m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]);
__m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 penetration = _mm256_sub_ps(dist, radius_vec);
__m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(penetration, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 delta_x = _mm256_mul_ps(dx, corr_factor);
__m256 delta_y = _mm256_mul_ps(dy, corr_factor);
__m256 delta_z = _mm256_mul_ps(dz, corr_factor);
delta_x = _mm256_mul_ps(delta_x, pm);
delta_y = _mm256_mul_ps(delta_y, pm);
delta_z = _mm256_mul_ps(delta_z, pm);
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x));
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y));
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z));
}
}
// Store final constraint errors for next frame's AVBD regularization.
// This runs once at the very end of the frame, after all solver iterations.
for (int c = 0; c < MAX_CHAINS; ++c) {
int p_base = c * PARTICLES_PER_CHAIN;
int r_base = c * CONSTRAINTS_PER_CHAIN;
// Loop through each block of 8 constraints (distance constraints are N-1 for N particles)
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += VEC_FLOAT_COUNT) {
// Calculate the current error for this distance constraint block
// (Similar to C_x_original calculation from Step 3, but using final positions)
// Load final positions for p0 and p1 (from global arrays now, as local buffer is stale)
__m256 px0 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 0]);
__m256 py0 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 0]);
__m256 pz0 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 0]);
// The +1 offset is not 32-byte aligned, so these must be unaligned loads.
// Caution: in the last block the final lane reads one slot past this chain's
// particles; that lane is the dummy constraint, whose correction the solver masks out.
__m256 px1 = _mm256_loadu_ps(&cs->p_pos_x[p_base + i + 1]);
__m256 py1 = _mm256_loadu_ps(&cs->p_pos_y[p_base + i + 1]);
__m256 pz1 = _mm256_loadu_ps(&cs->p_pos_z[p_base + i + 1]);
__m256 diff_x = _mm256_sub_ps(px1, px0);
__m256 diff_y = _mm256_sub_ps(py1, py0);
__m256 diff_z = _mm256_sub_ps(pz1, pz0);
__m256 dist_sq = _mm256_mul_ps(diff_x, diff_x);
dist_sq = _mm256_fmadd_ps(diff_y, diff_y, dist_sq);
dist_sq = _mm256_fmadd_ps(diff_z, diff_z, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + i]);
__m256 final_error_vec = _mm256_sub_ps(dist, rest_len_vec);
_mm256_store_ps(&cs->dist_prev_frame_error[r_base + i], final_error_vec); // Store for next frame's regularization
}
}
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
int p_base = c_idx * PARTICLES_PER_CHAIN;
int s_base = c_idx * SPHERES_PER_CHAIN; // For accessing sphere data for this chain
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 deepest_penetration_vec = neg_flt_max_vec; // Start at -FLT_MAX; clamped to >= 0 below
// Loop through all spheres for this chain to find the max penetration for each particle in the block
for (int s_idx = 0; s_idx < SPHERES_PER_CHAIN; ++s_idx) {
__m256 s_x = _mm256_set1_ps(cs->sphere_x[s_base + s_idx]);
__m256 s_y = _mm256_set1_ps(cs->sphere_y[s_base + s_idx]);
__m256 s_z = _mm256_set1_ps(cs->sphere_z[s_base + s_idx]);
__m256 s_r = _mm256_set1_ps(cs->sphere_radius[s_base + s_idx]);
__m256 dx_s = _mm256_sub_ps(p_curr_x, s_x);
__m256 dy_s = _mm256_sub_ps(p_curr_y, s_y);
__m256 dz_s = _mm256_sub_ps(p_curr_z, s_z);
__m256 dist_sq_s = _mm256_mul_ps(dx_s, dx_s);
dist_sq_s = _mm256_fmadd_ps(dy_s, dy_s, dist_sq_s);
dist_sq_s = _mm256_fmadd_ps(dz_s, dz_s, dist_sq_s);
__m256 inv_dist_s = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq_s, eps_vec));
__m256 dist_s = _mm256_rcp_ps(inv_dist_s);
__m256 current_penetration_s = _mm256_sub_ps(s_r, dist_s); // C(x) = radius - distance
// We only care about positive penetration (i.e., actually colliding)
current_penetration_s = _mm256_max_ps(zero_vec, current_penetration_s);
deepest_penetration_vec = _mm256_max_ps(deepest_penetration_vec, current_penetration_s);
}
_mm256_store_ps(&cs->coll_prev_frame_error[p_base + i], deepest_penetration_vec); // Store for next frame's regularization
}
}
}
#include <immintrin.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <float.h>
enum {
MAX_CHNS = 64,
PTC_PER_CHN = 16,
SPH_PER_CHN = 8,
MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
CON_PER_CHN = PTC_PER_CHN,
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
MAX_ITR = 16,
};
struct chn_cfg {
float damping; // [0, 1]
float stiffness; // [0, 1]
float stretch_factor;
float max_strain;
float friction; // [0, 1]
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
};
struct chn_sim {
unsigned char free_idx_cnt;
unsigned char free_idx[MAX_CHNS];
struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));
// chain global forces (world space gravity and wind)
float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));
// chain transformations (world space)
float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));
// particle positions (model space)
float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));
// sphere positions (model space)
float sph_x[MAX_SPH] __attribute__((aligned(32)));
float sph_y[MAX_SPH] __attribute__((aligned(32)));
float sph_z[MAX_SPH] __attribute__((aligned(32)));
float sph_r[MAX_SPH] __attribute__((aligned(32)));
// rest positions (model space)
float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
float motion_radius[MAX_PTC] __attribute__((aligned(32)));
};
extern void
chn_sim_init(struct chn_sim *cns) {
assert(cns);
cns->free_idx_cnt = MAX_CHNS;
for (int i = 0; i < MAX_CHNS; ++i) {
cns->free_idx[i] = MAX_CHNS - i - 1;
}
for (int i = 0; i < MAX_CHNS; i += 8) {
_mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f));
_mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f));
}
}
extern int
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
assert(cns);
assert(cfg);
assert(rest_pos);
assert(cns->free_idx_cnt > 0);
assert(cnt <= PTC_PER_CHN);
assert(cnt > 1);
int chn = cns->free_idx[--cns->free_idx_cnt];
cns->chn_cfg[chn] = *cfg;
int p_idx = (chn * PTC_PER_CHN);
int r_idx = (chn * CON_PER_CHN);
for (int i = 0; i < cnt; ++i) {
cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];
// seed previous positions with the rest pose so the first Verlet step sees zero velocity
cns->ptc_prev_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->ptc_prev_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->ptc_prev_pos_z[p_idx + i] = rest_pos[i*4+2];
cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
}
for (int i = cnt; i < PTC_PER_CHN; ++i) {
cns->ptc_pos_x[p_idx + i] = 0.0f;
cns->ptc_pos_y[p_idx + i] = 0.0f;
cns->ptc_pos_z[p_idx + i] = 0.0f;
cns->ptc_prev_pos_x[p_idx + i] = 0.0f;
cns->ptc_prev_pos_y[p_idx + i] = 0.0f;
cns->ptc_prev_pos_z[p_idx + i] = 0.0f;
cns->ptc_inv_mass[p_idx + i] = 0.0f;
}
for (int i = 0; i < PTC_PER_CHN; ++i) {
cns->motion_radius[p_idx + i] = FLT_MAX; // unconstrained until chn_sim_con_motion()
}
for (int i = 1; i < cnt; ++i) {
float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
}
// rest lengths were written for indices 0..cnt-2; zero the unused tail
for (int i = cnt - 1; i < CON_PER_CHN; ++i) {
cns->ptc_rest_len[r_idx + i] = 0.0f;
}
return chn;
}
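// Illustrative usage sketch (arbitrary values; rest_pos packs one particle per
// four floats as [x, y, z, inv_mass], with inv_mass 0 pinning the root):
//   struct chn_sim sim;
//   chn_sim_init(&sim);
//   struct chn_cfg cfg = {.damping = 0.98f, .stiffness = 1.0f,
//     .stretch_factor = 1.0f, .max_strain = 0.1f, .friction = 0.3f,
//     .linear_inertia = 1.0f, .angular_inertia = 1.0f, .centrifugal_inertia = 1.0f};
//   float rest[4*4] = {0,0,0,0, 0,-1,0,1, 0,-2,0,1, 0,-3,0,1};
//   int chn = chn_sim_add(&sim, &cfg, rest, 4);
//   chn_sim_grav(&sim, chn, (float[3]){0.0f, -9.81f, 0.0f});
//   chn_sim_run(&sim, 1.0f / 60.0f);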
extern void
chn_sim_del(struct chn_sim *cns, int chn) {
assert(cns);
assert(chn >= 0);
assert(chn < MAX_CHNS);
int p_idx = (chn * PTC_PER_CHN);
int s_idx = (chn * SPH_PER_CHN);
int c_idx = (chn * CON_PER_CHN);
cns->free_idx[cns->free_idx_cnt++] = chn;
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
cns->chn_grav_x[chn] = 0.0f;
cns->chn_grav_y[chn] = 0.0f;
cns->chn_grav_z[chn] = 0.0f;
cns->chn_wnd_x[chn] = 0.0f;
cns->chn_wnd_y[chn] = 0.0f;
cns->chn_wnd_z[chn] = 0.0f;
cns->chn_pos_x[chn] = 0.0f;
cns->chn_pos_y[chn] = 0.0f;
cns->chn_pos_z[chn] = 0.0f;
cns->chn_quat_x[chn] = 0.0f;
cns->chn_quat_y[chn] = 0.0f;
cns->chn_quat_z[chn] = 0.0f;
cns->chn_quat_w[chn] = 1.0f;
cns->chn_prev_pos_x[chn] = 0.0f;
cns->chn_prev_pos_y[chn] = 0.0f;
cns->chn_prev_pos_z[chn] = 0.0f;
cns->chn_prev_quat_x[chn] = 0.0f;
cns->chn_prev_quat_y[chn] = 0.0f;
cns->chn_prev_quat_z[chn] = 0.0f;
cns->chn_prev_quat_w[chn] = 1.0f;
_mm256_store_ps(&cns->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_x[s_idx], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_y[s_idx], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_z[s_idx], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->sph_r[s_idx], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f));
_mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f));
}
extern void
chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
assert(cns);
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = rot4[3];
}
extern void
chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
assert(cns);
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
}
extern void
chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
assert(cns);
assert(grav3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_grav_x[chn] = grav3[0];
cns->chn_grav_y[chn] = grav3[1];
cns->chn_grav_z[chn] = grav3[2];
}
extern void
chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
assert(cns);
assert(wind3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_wnd_x[chn] = wind3[0];
cns->chn_wnd_y[chn] = wind3[1];
cns->chn_wnd_z[chn] = wind3[2];
}
extern void
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
assert(cns);
assert(spheres);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt <= SPH_PER_CHN);
int s_idx = (chn * SPH_PER_CHN);
for (int i = 0; i < cnt; i++) {
cns->sph_x[s_idx + i] = spheres[i*4+0];
cns->sph_y[s_idx + i] = spheres[i*4+1];
cns->sph_z[s_idx + i] = spheres[i*4+2];
cns->sph_r[s_idx + i] = spheres[i*4+3];
}
for (int i = cnt; i < SPH_PER_CHN; ++i) {
cns->sph_x[s_idx + i] = 0;
cns->sph_y[s_idx + i] = 0;
cns->sph_z[s_idx + i] = 0;
cns->sph_r[s_idx + i] = 0;
}
}
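// Spheres pack one collider per four floats as [x, y, z, radius], in chain
// model space. Illustrative call (hypothetical handles):
//   chn_sim_set_sph(&sim, chn, (float[]){0.0f, -1.0f, 0.0f, 0.5f}, 1);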
extern void
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
assert(cns);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt >= 0);
assert(cnt <= PTC_PER_CHN);
int p_idx = (chn * PTC_PER_CHN);
for (int i = 0; i < cnt; ++i) {
cns->motion_radius[p_idx + i] = radius[i];
}
for (int i = cnt; i < PTC_PER_CHN; ++i) {
cns->motion_radius[p_idx + i] = FLT_MAX;
}
}
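// Illustrative call (hypothetical handles): pin the first particle to its rest
// position, allow the second 5cm of play, and leave the rest unconstrained:
//   chn_sim_con_motion(&sim, chn, (float[]){0.0f, 0.05f}, 2);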
extern void
chn_sim_run(struct chn_sim *cns, float dt) {
// SIMD constants
const __m256 dt_vec = _mm256_set1_ps(dt);
const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
const __m256 one_vec = _mm256_set1_ps(1.0f);
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
const __m256 eps_vec = _mm256_set1_ps(1e-6f);
const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f);
const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f);
const __m256 zero_vec = _mm256_setzero_ps();
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
const __m256 two_vec = _mm256_set1_ps(2.0f);
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
// Local arrays for PBD solver
float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
// Initialize dummy elements
px_lcl[PTC_PER_CHN] = 0.0f;
py_lcl[PTC_PER_CHN] = 0.0f;
pz_lcl[PTC_PER_CHN] = 0.0f;
pm_lcl[PTC_PER_CHN] = 0.0f;
// Stack arrays for precomputed data
float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
float vel_x[MAX_CHNS] __attribute__((aligned(32)));
float vel_y[MAX_CHNS] __attribute__((aligned(32)));
float vel_z[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));
// Step 0: Precomputation (SIMD, 8 chains at once)
for (int c = 0; c < MAX_CHNS; c += 8) {
// Load chain quaternions
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Compute local-space wind
{
__m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]);
__m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]);
__m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]);
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: res_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// res_x = tw*qx + tx*qw + ty*qz - tz*qy
// res_y = tw*qy + ty*qw + tz*qx - tx*qz
// res_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
_mm256_store_ps(&chn_wnd_lcl_x[c], res_x);
_mm256_store_ps(&chn_wnd_lcl_y[c], res_y);
_mm256_store_ps(&chn_wnd_lcl_z[c], res_z);
}
// Compute local-space gravity
{
__m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]);
__m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]);
__m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]);
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: res_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// res_x = tw*qx + tx*qw + ty*qz - tz*qy
// res_y = tw*qy + ty*qw + tz*qx - tx*qz
// res_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x));
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y));
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z));
_mm256_store_ps(&chn_grav_lcl_x[c], res_x);
_mm256_store_ps(&chn_grav_lcl_y[c], res_y);
_mm256_store_ps(&chn_grav_lcl_z[c], res_z);
}
// Compute linear velocity
{
__m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]);
__m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]);
__m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]);
__m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]);
__m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]);
__m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]);
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
_mm256_store_ps(&vel_x[c], vel_x_vec);
_mm256_store_ps(&vel_y[c], vel_y_vec);
_mm256_store_ps(&vel_z[c], vel_z_vec);
}
// Compute angular velocity
{
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]);
__m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]);
__m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]);
__m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]);
__m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]);
// Step 1: Compute delta quaternion (to - from)
__m256 dt_qx = _mm256_sub_ps(qx, prev_qx);
__m256 dt_qy = _mm256_sub_ps(qy, prev_qy);
__m256 dt_qz = _mm256_sub_ps(qz, prev_qz);
__m256 dt_qw = _mm256_sub_ps(qw, prev_qw);
// Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
__m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
__m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
__m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
__m256 conj_qw = prev_qw;
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
// Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
// Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
__m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw));
orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw);
orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw);
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy));
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz));
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx));
__m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw));
orient_qx = _mm256_add_ps(orient_qx, cross_x);
__m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw));
orient_qy = _mm256_add_ps(orient_qy, cross_y);
__m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw));
orient_qz = _mm256_add_ps(orient_qz, cross_z);
// Step 4: Compute dot product (to, from) for shortest arc check
__m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
dot = _mm256_fmadd_ps(qy, prev_qy, dot);
dot = _mm256_fmadd_ps(qz, prev_qz, dot);
// Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
__m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
__m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
_mm256_andnot_ps(negate_mask, one_vec));
orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
orient_qz = _mm256_mul_ps(orient_qz, negate_sign);
// Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
__m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
__m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
__m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
__m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);
// Store results
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
}
}
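// Scalar reference for the two transforms above (a sketch, not called code):
//   v_lcl = vec((qw,-qx,-qy,-qz) * (0, v_world) * q)    // rotate by conj(q)
//   ω     = vec((q - q_prev) * conj(q_prev)) / (2*dt)   // negated if dot(q, q_prev) < 0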
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
for (int c = 0; c < MAX_CHNS; ++c) {
int base_idx = c * PTC_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
// Load precomputed velocities and inertia parameters
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
// Clamp inertia parameters to [0, 1]
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
for (int i = 0; i < PTC_PER_CHN; i += 8) {
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
__m256 p_ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]);
p_ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
// Note: the clamped inverse mass is currently unused here; the inertia deltas
// below move every particle with the chain transform regardless of mass.
// Linear inertia: v * dt * linear_inertia
__m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
__m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
__m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
// Angular inertia: (ω × r) * dt * angular_inertia
__m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec);
ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec);
ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec);
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
// Calculate displacement: (ω × (ω × r)) * dt^2
__m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec);
// Apply the centrifugal_inertia factor
__m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec);
__m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec);
__m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec);
// Total delta
__m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x);
__m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y);
__m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z);
// Update positions
_mm256_store_ps(&cns->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z));
}
// Update previous transformation
cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];
cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
}
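// Scalar form of the per-particle inertia delta applied above, with r the
// particle position in chain-local space:
//   dp = v*dt*linear_inertia + (ω × r)*dt*angular_inertia
//      + (ω × (ω × r))*dt^2*centrifugal_inertia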
// Step 2: Verlet Integration
for (int c = 0; c < MAX_CHNS; ++c) {
int base_idx = c * PTC_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]);
__m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]);
__m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]);
__m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]);
__m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]);
__m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]);
__m256 damping_vec = _mm256_set1_ps(cfg->damping);
for (int i = 0; i < PTC_PER_CHN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[base_idx + i]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[base_idx + i]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[base_idx + i]);
__m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec);
__m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec);
__m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec);
__m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cns->ptc_pos_x[base_idx + i], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[base_idx + i], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[base_idx + i], new_p_z);
_mm256_store_ps(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
_mm256_store_ps(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
_mm256_store_ps(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
}
}
// Step 3: Distance Constraints
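// PBD projection per constraint between particles (i, i+1), relaxed MAX_ITR times:
//   C = dist - rest_len, strain = C / rest_len (clamped to +/- max_strain)
//   k = stiffness / (1 + |strain|)   (response softens as strain grows)
//   dp = d_hat * C * k * stretch_factor / (w0 + w1); p0 += dp * w0, p1 -= dp * w1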
for (int c = 0; c < MAX_CHNS; ++c) {
int p_base = c * PTC_PER_CHN;
int r_base = c * CON_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 stiff_vec = _mm256_set1_ps(cfg->stiffness);
__m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor);
__m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain);
__m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec);
for (int i = 0; i < PTC_PER_CHN; i += 8) {
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
_mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cns->ptc_pos_x[p_base + i]));
_mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cns->ptc_pos_y[p_base + i]));
_mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cns->ptc_pos_z[p_base + i]));
_mm256_store_ps(&pm_lcl[i], pm);
}
for (int iter = 0; iter < MAX_ITR; ++iter) {
// First block (i=0 to 7)
{
__m256 p0x = _mm256_load_ps(&px_lcl[0]);
__m256 p0y = _mm256_load_ps(&py_lcl[0]);
__m256 p0z = _mm256_load_ps(&pz_lcl[0]);
__m256 p0m = _mm256_load_ps(&pm_lcl[0]);
__m256 p1x = _mm256_loadu_ps(&px_lcl[1]);
__m256 p1y = _mm256_loadu_ps(&py_lcl[1]);
__m256 p1z = _mm256_loadu_ps(&pz_lcl[1]);
__m256 p1m = _mm256_loadu_ps(&pm_lcl[1]);
__m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base]);
__m256 dx = _mm256_sub_ps(p1x, p0x);
__m256 dy = _mm256_sub_ps(p1y, p0y);
__m256 dz = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 w_sum = _mm256_add_ps(p0m, p1m);
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 diff = _mm256_sub_ps(dist, rest_len_vec);
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec);
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));
__m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
__m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom));
__m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec);
corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum);
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part);
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude);
__m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude);
__m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude);
dt_x = _mm256_and_ps(dt_x, apply_corr_mask);
dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
dt_z = _mm256_and_ps(dt_z, apply_corr_mask);
_mm256_store_ps(&px_lcl[0], _mm256_fmadd_ps(dt_x, p0m, p0x));
_mm256_store_ps(&py_lcl[0], _mm256_fmadd_ps(dt_y, p0m, p0y));
_mm256_store_ps(&pz_lcl[0], _mm256_fmadd_ps(dt_z, p0m, p0z));
_mm256_storeu_ps(&px_lcl[1], _mm256_fnmadd_ps(dt_x, p1m, p1x));
_mm256_storeu_ps(&py_lcl[1], _mm256_fnmadd_ps(dt_y, p1m, p1y));
_mm256_storeu_ps(&pz_lcl[1], _mm256_fnmadd_ps(dt_z, p1m, p1z));
}
// Second block (i=8 to 14)
{
__m256 p0x = _mm256_load_ps(&px_lcl[8]);
__m256 p0y = _mm256_load_ps(&py_lcl[8]);
__m256 p0z = _mm256_load_ps(&pz_lcl[8]);
__m256 p0m = _mm256_load_ps(&pm_lcl[8]);
__m256 p1x = _mm256_loadu_ps(&px_lcl[9]);
__m256 p1y = _mm256_loadu_ps(&py_lcl[9]);
__m256 p1z = _mm256_loadu_ps(&pz_lcl[9]);
__m256 p1m = _mm256_loadu_ps(&pm_lcl[9]);
__m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]);
__m256 dx = _mm256_sub_ps(p1x, p0x);
__m256 dy = _mm256_sub_ps(p1y, p0y);
__m256 dz = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 w_sum = _mm256_add_ps(p0m, p1m);
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 diff = _mm256_sub_ps(dist, rest_len_vec);
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec);
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));
__m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
__m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom));
__m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec);
corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum);
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part);
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
// Lane 7 would pair particle 15 with nonexistent particle 16, so mask it off.
// The mask must be all-bits-set per active lane for the bitwise ANDs below;
// float -1.0f (0xBF800000) is not all-ones and would corrupt the corrections.
__attribute__((aligned(32))) static const unsigned int valid_mask_bits[8] = {
0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu,
0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x00000000u
};
__m256 valid_mask = _mm256_load_ps((const float *)valid_mask_bits);
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);
__m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude);
__m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude);
__m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude);
dt_x = _mm256_and_ps(dt_x, apply_corr_mask);
dt_y = _mm256_and_ps(dt_y, apply_corr_mask);
dt_z = _mm256_and_ps(dt_z, apply_corr_mask);
_mm256_store_ps(&px_lcl[8], _mm256_fmadd_ps(dt_x, p0m, p0x));
_mm256_store_ps(&py_lcl[8], _mm256_fmadd_ps(dt_y, p0m, p0y));
_mm256_store_ps(&pz_lcl[8], _mm256_fmadd_ps(dt_z, p0m, p0z));
_mm256_storeu_ps(&px_lcl[9], _mm256_fnmadd_ps(dt_x, p1m, p1x));
_mm256_storeu_ps(&py_lcl[9], _mm256_fnmadd_ps(dt_y, p1m, p1y));
_mm256_storeu_ps(&pz_lcl[9], _mm256_fnmadd_ps(dt_z, p1m, p1z));
}
}
for (int i = 0; i < PTC_PER_CHN; i += 8) {
_mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i]));
_mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i]));
_mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i]));
}
}
// Step 4: Sphere Collisions with Friction
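// Penetrating particles are pushed out along the contact normal by the penetration
// depth; a pseudo-friction correction proportional to penetration * friction is
// subtracted along the tangential (sliding) part of the Verlet velocity.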
for (int c = 0; c < MAX_CHNS; ++c) {
int p_base = c * PTC_PER_CHN;
int s_base = c * SPH_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 fric_vec = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPH_PER_CHN; ++s) {
__m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]);
__m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]);
__m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]);
__m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]);
for (int i = 0; i < PTC_PER_CHN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]);
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]);
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]);
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + i]);
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + i]);
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + i]);
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 pen = _mm256_sub_ps(sph_r_vec, dist);
__m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 norm_x = _mm256_mul_ps(dx, inv_dist);
__m256 norm_y = _mm256_mul_ps(dy, inv_dist);
__m256 norm_z = _mm256_mul_ps(dz, inv_dist);
__m256 norm_corr_x = _mm256_mul_ps(norm_x, pen);
__m256 norm_corr_y = _mm256_mul_ps(norm_y, pen);
__m256 norm_corr_z = _mm256_mul_ps(norm_z, pen);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 vel_dot_norm = _mm256_mul_ps(vel_x, norm_x);
vel_dot_norm = _mm256_fmadd_ps(vel_y, norm_y, vel_dot_norm);
vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, vel_dot_norm);
__m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x));
__m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y));
__m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z));
__m256 tan_mag_sq = _mm256_mul_ps(tan_x, tan_x);
tan_mag_sq = _mm256_fmadd_ps(tan_y, tan_y, tan_mag_sq);
tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, tan_mag_sq);
__m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec));
__m256 fric_mag = _mm256_mul_ps(pen, fric_vec);
__m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag);
__m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag);
__m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag);
__m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x);
__m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y);
__m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z);
total_corr_x = _mm256_and_ps(total_corr_x, col_mask);
total_corr_y = _mm256_and_ps(total_corr_y, col_mask);
total_corr_z = _mm256_and_ps(total_corr_z, col_mask);
total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass);
total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass);
total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass);
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
_mm256_store_ps(&cns->ptc_pos_x[p_base + i], new_p_x);
_mm256_store_ps(&cns->ptc_pos_y[p_base + i], new_p_y);
_mm256_store_ps(&cns->ptc_pos_z[p_base + i], new_p_z);
}
}
}
// Step 5: Motion Constraints
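// Each particle is confined to a sphere of motion_radius around its rest pose:
// if dist(p, rest) exceeds the radius, the excess is pulled back along the offset,
// scaled by the clamped inverse mass (so pinned particles stay put).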
for (int c = 0; c < MAX_CHNS; ++c) {
int p_base = c * PTC_PER_CHN;
for (int i = 0; i < PTC_PER_CHN; i += 8) {
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]);
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]);
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]);
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]);
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + i]);
__m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + i]);
__m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + i]);
__m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + i]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 pen = _mm256_sub_ps(dist, r_vec);
__m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(pen, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 dt_x = _mm256_mul_ps(dx, corr_factor);
__m256 dt_y = _mm256_mul_ps(dy, corr_factor);
__m256 dt_z = _mm256_mul_ps(dz, corr_factor);
dt_x = _mm256_mul_ps(dt_x, pm);
dt_y = _mm256_mul_ps(dt_y, pm);
dt_z = _mm256_mul_ps(dt_z, pm);
_mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x));
_mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y));
_mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z));
}
}
}
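// Hedged scalar reference: a minimal sketch of one Step 3 relaxation pass,
// using only the math already present in the intrinsic blocks above. The
// function name and parameters are local to this sketch (not part of the
// simulation API); it exists so the vectorized path can be sanity-checked
// against plain C.
static void
chn_solve_dist_scalar_ref(float *px, float *py, float *pz,
                          const float *inv_mass, const float *rest_len,
                          int ptc_cnt, float stiffness, float stretch_factor,
                          float max_strain) {
  for (int i = 0; i < ptc_cnt - 1; ++i) {
    float dx = px[i + 1] - px[i];
    float dy = py[i + 1] - py[i];
    float dz = pz[i + 1] - pz[i];
    float dist_sq = dx * dx + dy * dy + dz * dz;
    if (dist_sq <= 1e-6f) {
      continue; // coincident particles: no defined correction direction
    }
    float w_sum = inv_mass[i] + inv_mass[i + 1];
    if (w_sum <= 1e-6f) {
      continue; // both particles pinned
    }
    float dist = sqrtf(dist_sq);
    float diff = dist - rest_len[i];
    float strain = diff / fmaxf(rest_len[i], 1e-6f);
    strain = fmaxf(-max_strain, fminf(strain, max_strain));
    float dyn_stiff = stiffness / (1.0f + fabsf(strain)); // softens under strain
    float corr = diff * dyn_stiff * stretch_factor / w_sum;
    float nx = dx / dist, ny = dy / dist, nz = dz / dist;
    px[i] += nx * corr * inv_mass[i];
    py[i] += ny * corr * inv_mass[i];
    pz[i] += nz * corr * inv_mass[i];
    px[i + 1] -= nx * corr * inv_mass[i + 1];
    py[i + 1] -= ny * corr * inv_mass[i + 1];
    pz[i + 1] -= nz * corr * inv_mass[i + 1];
  }
}
// Below: an ISPC port of the same solver.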
#define MAX_CHAINS 64
#define PARTICLES_PER_CHAIN 16
#define SPHERES_PER_CHAIN 8
#define MAX_PARTICLES (MAX_CHAINS * PARTICLES_PER_CHAIN)
#define MAX_SPHERES (MAX_CHAINS * SPHERES_PER_CHAIN)
#define CONSTRAINTS_PER_CHAIN PARTICLES_PER_CHAIN
#define MAX_REST_LENGTHS (MAX_CHAINS * CONSTRAINTS_PER_CHAIN)
#define MAX_ITERATIONS 8
// Define chain configuration structure
struct chain_cfg {
uniform float damping;
uniform float stiffness;
uniform float stretch_factor;
uniform float max_strain;
uniform float friction;
uniform float collision_radius;
uniform float linear_inertia; // [0, 1]
uniform float angular_inertia; // [0, 1]
uniform float centrifugal_inertia;// [0, 1]
};
// Define chain simulation structure
struct chain_sim {
uint8 free_idx_cnt;
uint8 free_idx[MAX_CHAINS];
uniform struct chain_cfg chain_configs[MAX_CHAINS];
// Chain global forces (world space gravity and wind)
uniform float gravity_x[MAX_CHAINS];
uniform float gravity_y[MAX_CHAINS];
uniform float gravity_z[MAX_CHAINS];
uniform float wind_x[MAX_CHAINS];
uniform float wind_y[MAX_CHAINS];
uniform float wind_z[MAX_CHAINS];
// Chain transformations (world space)
uniform float chain_pos_x[MAX_CHAINS];
uniform float chain_pos_y[MAX_CHAINS];
uniform float chain_pos_z[MAX_CHAINS];
uniform float chain_quat_x[MAX_CHAINS];
uniform float chain_quat_y[MAX_CHAINS];
uniform float chain_quat_z[MAX_CHAINS];
uniform float chain_quat_w[MAX_CHAINS];
uniform float prev_chain_pos_x[MAX_CHAINS];
uniform float prev_chain_pos_y[MAX_CHAINS];
uniform float prev_chain_pos_z[MAX_CHAINS];
uniform float prev_chain_quat_x[MAX_CHAINS];
uniform float prev_chain_quat_y[MAX_CHAINS];
uniform float prev_chain_quat_z[MAX_CHAINS];
uniform float prev_chain_quat_w[MAX_CHAINS];
// Particle positions (model space)
uniform float p_pos_x[MAX_PARTICLES];
uniform float p_pos_y[MAX_PARTICLES];
uniform float p_pos_z[MAX_PARTICLES];
uniform float prev_p_pos_x[MAX_PARTICLES];
uniform float prev_p_pos_y[MAX_PARTICLES];
uniform float prev_p_pos_z[MAX_PARTICLES];
uniform float inv_mass[MAX_PARTICLES];
uniform float rest_lengths[MAX_REST_LENGTHS];
// Sphere positions (model space)
uniform float sphere_x[MAX_SPHERES];
uniform float sphere_y[MAX_SPHERES];
uniform float sphere_z[MAX_SPHERES];
uniform float sphere_radius[MAX_SPHERES];
// Rest positions (model space)
uniform float rest_pos_x[MAX_PARTICLES];
uniform float rest_pos_y[MAX_PARTICLES];
uniform float rest_pos_z[MAX_PARTICLES];
uniform float motion_radius[MAX_PARTICLES];
};
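// Note: the export below takes each array as an individual pointer rather than
// this struct; the struct documents the SoA layout the host side is expected to keep.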
// ISPC task for chain simulation
export void simulate_chains(
__attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr,
__attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr,
__attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr,
__attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr,
__attribute__((aligned(32))) uniform float *uniform wind_x_ptr,
__attribute__((aligned(32))) uniform float *uniform wind_y_ptr,
__attribute__((aligned(32))) uniform float *uniform wind_z_ptr,
__attribute__((aligned(32))) uniform float *uniform gravity_x_ptr,
__attribute__((aligned(32))) uniform float *uniform gravity_y_ptr,
__attribute__((aligned(32))) uniform float *uniform gravity_z_ptr,
__attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr,
__attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr,
__attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr,
__attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr,
__attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr,
__attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr,
__attribute__((aligned(32))) uniform float *uniform inv_mass_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr,
__attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr,
__attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr,
__attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs
__attribute__((aligned(32))) uniform float *uniform sphere_x_ptr,
__attribute__((aligned(32))) uniform float *uniform sphere_y_ptr,
__attribute__((aligned(32))) uniform float *uniform sphere_z_ptr,
__attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr,
__attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr,
__attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr,
__attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr,
__attribute__((aligned(32))) uniform float *uniform motion_radius_ptr,
uniform float dt) {
// Common SIMD constants (varying for vector operations across lanes)
varying float dt_vec = dt;
varying float dt2_vec = dt * dt;
varying float one_vec = 1.0;
varying float neg_one_vec = -1.0;
varying float eps_vec = 1e-6;
varying float inv_mass_clamp_min = 0.0;
varying float inv_mass_clamp_max = 1e6;
varying float zero_vec = 0.0;
varying float inv_dt_vec = 1.0 / dt;
varying float two_vec = 2.0;
varying float inertia_clamp_min = 0.0;
varying float inertia_clamp_max = 1.0;
assume(MAX_CHAINS % programCount == 0);
assume(PARTICLES_PER_CHAIN % programCount == 0);
assume(SPHERES_PER_CHAIN % programCount == 0);
assume(MAX_SPHERES % programCount == 0);
assume(CONSTRAINTS_PER_CHAIN % programCount == 0);
assume(MAX_REST_LENGTHS % programCount == 0);
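// The assume() hints promise whole-vector trip counts, so the compiler can skip
// scalar remainder loops for the particle, sphere, and constraint loops below.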
// Local arrays for PBD solver
__attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1];
__attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1];
__attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1];
__attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1];
// Zero the sentinel slot so p1 loads at the last constraint read a zero-mass dummy.
px_local[PARTICLES_PER_CHAIN] = 0.0;
py_local[PARTICLES_PER_CHAIN] = 0.0;
pz_local[PARTICLES_PER_CHAIN] = 0.0;
pm_local[PARTICLES_PER_CHAIN] = 0.0;
// Stack arrays for precomputed data
__attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS];
__attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS];
__attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS];
__attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS];
__attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS];
__attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS];
__attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS];
__attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS];
__attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS];
__attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS];
__attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS];
__attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS];
// Step 0: Precomputation (SIMD, process chains in groups of programCount)
for (uniform int c = 0; c < MAX_CHAINS; c += programCount) {
// Load chain quaternions
varying float qx = chain_quat_x_ptr[c + programIndex];
varying float qy = chain_quat_y_ptr[c + programIndex];
varying float qz = chain_quat_z_ptr[c + programIndex];
varying float qw = chain_quat_w_ptr[c + programIndex];
// Compute local-space wind
{
varying float vx = wind_x_ptr[c + programIndex];
varying float vy = wind_y_ptr[c + programIndex];
varying float vz = wind_z_ptr[c + programIndex];
// Create q_conjugate components
varying float cqx = -qx;
varying float cqy = -qy;
varying float cqz = -qz;
// Step 1: t = q_conj * v_world_as_quat
varying float tx = qw * vx + cqy * vz - cqz * vy;
varying float ty = qw * vy + cqz * vx - cqx * vz;
varying float tz = qw * vz + cqx * vy - cqy * vx;
varying float tw = qx * vx + qy * vy + qz * vz;
// Step 2: result_vec = (t * q)_vec
varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;
wind_local_x[c + programIndex] = result_x;
wind_local_y[c + programIndex] = result_y;
wind_local_z[c + programIndex] = result_z;
}
// Compute local-space gravity
{
varying float vx = gravity_x_ptr[c + programIndex];
varying float vy = gravity_y_ptr[c + programIndex];
varying float vz = gravity_z_ptr[c + programIndex];
varying float cqx = -qx;
varying float cqy = -qy;
varying float cqz = -qz;
// Step 1: t = q_conj * v_world_as_quat
varying float tx = qw * vx + cqy * vz - cqz * vy;
varying float ty = qw * vy + cqz * vx - cqx * vz;
varying float tz = qw * vz + cqx * vy - cqy * vx;
varying float tw = qx * vx + qy * vy + qz * vz;
// Step 2: result_vec = (t * q)_vec
varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy;
varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz;
varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx;
gravity_local_x[c + programIndex] = result_x;
gravity_local_y[c + programIndex] = result_y;
gravity_local_z[c + programIndex] = result_z;
}
// Compute linear velocity
{
varying float curr_x = chain_pos_x_ptr[c + programIndex];
varying float curr_y = chain_pos_y_ptr[c + programIndex];
varying float curr_z = chain_pos_z_ptr[c + programIndex];
varying float prev_x = prev_chain_pos_x_ptr[c + programIndex];
varying float prev_y = prev_chain_pos_y_ptr[c + programIndex];
varying float prev_z = prev_chain_pos_z_ptr[c + programIndex];
varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec;
varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec;
varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec;
vel_x[c + programIndex] = vel_x_vec;
vel_y[c + programIndex] = vel_y_vec;
vel_z[c + programIndex] = vel_z_vec;
}
// Compute angular velocity
{
varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex];
varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex];
varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex];
varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex];
// Step 1: Compute delta quaternion (to - from) - Note: This is an approximation
varying float delta_qx = qx - prev_qx;
varying float delta_qy = qy - prev_qy;
varying float delta_qz = qz - prev_qz;
varying float delta_qw = qw - prev_qw;
// Step 2: Compute conjugate of previous quaternion
varying float conj_qx = -prev_qx;
varying float conj_qy = -prev_qy;
varying float conj_qz = -prev_qz;
varying float conj_qw = prev_qw;
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz;
varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy;
varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz;
varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx;
varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x;
varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y;
varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z;
// Step 4: Compute dot product (to, from) for shortest arc check
varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz;
// Step 5: Negate orientation delta if dot < 0
varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec);
orient_qx *= negate_sign;
orient_qy *= negate_sign;
orient_qz *= negate_sign;
// Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
varying float scale = one_vec / (two_vec * dt_vec);
varying float ang_vel_x_vec = orient_qx * scale;
varying float ang_vel_y_vec = orient_qy * scale;
varying float ang_vel_z_vec = orient_qz * scale;
ang_vel_x[c + programIndex] = ang_vel_x_vec;
ang_vel_y[c + programIndex] = ang_vel_y_vec;
ang_vel_z[c + programIndex] = ang_vel_z_vec;
}
}
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
uniform int base_idx = c * PARTICLES_PER_CHAIN;
uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
// Load velocities and inertia (scalar, broadcast to lanes)
varying float vel_x_vec = vel_x[c];
varying float vel_y_vec = vel_y[c];
varying float vel_z_vec = vel_z[c];
varying float ang_vel_x_vec = ang_vel_x[c];
varying float ang_vel_y_vec = ang_vel_y[c];
varying float ang_vel_z_vec = ang_vel_z[c];
varying float linear_inertia_vec = cfg->linear_inertia;
varying float angular_inertia_vec = cfg->angular_inertia;
varying float centrifugal_inertia_vec = cfg->centrifugal_inertia;
// Clamp inertia parameters to [0, 1]
linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max);
angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max);
centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max);
// Process particles in groups
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
varying float px = p_pos_x_ptr[base_idx + i + programIndex];
varying float py = p_pos_y_ptr[base_idx + i + programIndex];
varying float pz = p_pos_z_ptr[base_idx + i + programIndex];
varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);
// Linear inertia: v * dt * linear_inertia
varying float lin_delta_x = vel_x_vec * dt_vec * linear_inertia_vec;
varying float lin_delta_y = vel_y_vec * dt_vec * linear_inertia_vec;
varying float lin_delta_z = vel_z_vec * dt_vec * linear_inertia_vec;
// Angular inertia: (ω × r) * dt * angular_inertia
varying float ang_delta_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
varying float ang_delta_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
varying float ang_delta_z = ang_vel_x_vec * py - ang_vel_y_vec * px;
ang_delta_x *= dt_vec * angular_inertia_vec;
ang_delta_y *= dt_vec * angular_inertia_vec;
ang_delta_z *= dt_vec * angular_inertia_vec;
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py;
varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz;
varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px;
varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y;
varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z;
varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x;
varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec;
varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec;
varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec;
// Total delta
varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x;
varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y;
varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z;
// Update positions
p_pos_x_ptr[base_idx + i + programIndex] = px + delta_x;
p_pos_y_ptr[base_idx + i + programIndex] = py + delta_y;
p_pos_z_ptr[base_idx + i + programIndex] = pz + delta_z;
}
// Update previous transformation (scalar)
prev_chain_pos_x_ptr[c] = chain_pos_x_ptr[c];
prev_chain_pos_y_ptr[c] = chain_pos_y_ptr[c];
prev_chain_pos_z_ptr[c] = chain_pos_z_ptr[c];
prev_chain_quat_x_ptr[c] = chain_quat_x_ptr[c];
prev_chain_quat_y_ptr[c] = chain_quat_y_ptr[c];
prev_chain_quat_z_ptr[c] = chain_quat_z_ptr[c];
prev_chain_quat_w_ptr[c] = chain_quat_w_ptr[c];
}
// Step 2: Verlet Integration
for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
uniform int base_idx = c * PARTICLES_PER_CHAIN;
uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
// Load forces and damping (scalar, broadcast to lanes)
varying float gravity_x_vec = gravity_local_x[c];
varying float gravity_y_vec = gravity_local_y[c];
varying float gravity_z_vec = gravity_local_z[c];
varying float wind_x_vec = wind_local_x[c];
varying float wind_y_vec = wind_local_y[c];
varying float wind_z_vec = wind_local_z[c];
varying float damping_vec = cfg->damping;
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
varying float p_curr_x = p_pos_x_ptr[base_idx + i + programIndex];
varying float p_curr_y = p_pos_y_ptr[base_idx + i + programIndex];
varying float p_curr_z = p_pos_z_ptr[base_idx + i + programIndex];
varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex];
p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max);
varying float p_prev_x = prev_p_pos_x_ptr[base_idx + i + programIndex];
varying float p_prev_y = prev_p_pos_y_ptr[base_idx + i + programIndex];
varying float p_prev_z = prev_p_pos_z_ptr[base_idx + i + programIndex];
// Compute forces and accelerations
varying float force_x = gravity_x_vec + wind_x_vec;
varying float force_y = gravity_y_vec + wind_y_vec;
varying float force_z = gravity_z_vec + wind_z_vec;
varying float acc_x = force_x * p_inv_mass;
varying float acc_y = force_y * p_inv_mass;
varying float acc_z = force_z * p_inv_mass;
// Compute velocity with damping
varying float v_x = p_curr_x - p_prev_x;
varying float v_y = p_curr_y - p_prev_y;
varying float v_z = p_curr_z - p_prev_z;
varying float damped_vel_x = v_x * damping_vec;
varying float damped_vel_y = v_y * damping_vec;
varying float damped_vel_z = v_z * damping_vec;
// Update positions: p_curr + damped_vel + acc * dt^2
varying float term_accel_x = acc_x * dt2_vec;
varying float term_accel_y = acc_y * dt2_vec;
varying float term_accel_z = acc_z * dt2_vec;
p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x;
p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y;
p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z;
// Update previous positions
prev_p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x;
prev_p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y;
prev_p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z;
}
}
// Step 3: Distance Constraints
for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
uniform int p_base = c * PARTICLES_PER_CHAIN;
uniform int r_base = c * CONSTRAINTS_PER_CHAIN;
uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
varying float stiffness_vec = cfg->stiffness;
varying float stretch_factor_vec = cfg->stretch_factor;
varying float max_strain_vec_abs = cfg->max_strain;
varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec;
// Copy particles to local arrays with uniform indexing
// This loop now performs a vectorized load into the uniform px_local array.
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
// Load a vector of particles from global memory into varying registers
varying float current_px = p_pos_x_ptr[p_base + i + programIndex];
varying float current_py = p_pos_y_ptr[p_base + i + programIndex];
varying float current_pz = p_pos_z_ptr[p_base + i + programIndex];
varying float current_pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);
// Store the varying values into the uniform local array. foreach_active
// serializes the scatter one active lane at a time, which keeps the
// writes to the uniform array well-defined.
foreach_active (idx) {
px_local[i + programIndex] = current_px;
py_local[i + programIndex] = current_py;
pz_local[i + programIndex] = current_pz;
pm_local[i + programIndex] = current_pm;
}
}
// Iterate over constraints
for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) {
// Process constraints in groups of programCount (0 to PARTICLES_PER_CHAIN-2)
for (uniform int i = 0; i < PARTICLES_PER_CHAIN - 1; i += programCount) {
// Now, load directly from uniform px_local using varying programIndex
// The compiler can optimize these into vector loads.
varying float p0x = px_local[i + programIndex];
varying float p0y = py_local[i + programIndex];
varying float p0z = pz_local[i + programIndex];
varying float p0m = pm_local[i + programIndex];
// For p1, note that the last active lane reads index i + programIndex + 1 ==
// PARTICLES_PER_CHAIN, which is the zeroed sentinel slot; that lane's
// correction is masked off below, so the dummy value never matters.
varying float p1x = px_local[i + programIndex + 1];
varying float p1y = py_local[i + programIndex + 1];
varying float p1z = pz_local[i + programIndex + 1];
varying float p1m = pm_local[i + programIndex + 1];
varying float rest_len_vec = rest_lengths_ptr[r_base + i + programIndex];
varying float dx = p1x - p0x;
varying float dy = p1y - p0y;
varying float dz = p1z - p0z;
varying float dist_sq = dx * dx + dy * dy + dz * dz;
varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
varying float dist = rcp(inv_dist);
varying float w_sum = p0m + p1m;
varying float rcp_w_sum = rcp(max(w_sum, eps_vec));
varying float diff = dist - rest_len_vec;
varying float strain = diff * rcp(max(rest_len_vec, eps_vec));
varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs);
varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped));
varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum;
varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0);
// Mask lanes where i + programIndex >= PARTICLES_PER_CHAIN - 1 (no constraint for last particle)
apply_corr_mask = select(i + programIndex < PARTICLES_PER_CHAIN - 1, apply_corr_mask, 0.0);
varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask;
varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask;
varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask;
// Apply corrections, writing back to uniform px_local
// These are now vector stores.
px_local[i + programIndex] += delta_x * p0m;
py_local[i + programIndex] += delta_y * p0m;
pz_local[i + programIndex] += delta_z * p0m;
px_local[i + programIndex + 1] -= delta_x * p1m;
py_local[i + programIndex + 1] -= delta_y * p1m;
pz_local[i + programIndex + 1] -= delta_z * p1m;
}
}
// Write back to global arrays
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
// Load a vector of particles from uniform local array into varying registers
varying float updated_px = px_local[i + programIndex];
varying float updated_py = py_local[i + programIndex];
varying float updated_pz = pz_local[i + programIndex];
// Store these varying values into the global memory
foreach_active (idx) {
p_pos_x_ptr[p_base + i + programIndex] = updated_px;
p_pos_y_ptr[p_base + i + programIndex] = updated_py;
p_pos_z_ptr[p_base + i + programIndex] = updated_pz;
}
}
}
// Step 4: Sphere Collisions with Friction
for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
uniform int p_base = c * PARTICLES_PER_CHAIN;
uniform int s_base = c * SPHERES_PER_CHAIN;
uniform struct chain_cfg *cfg = &chain_configs_ptr[c];
varying float friction_vec = cfg->friction;
for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) {
varying float sphere_x_vec = sphere_x_ptr[s_base + s];
varying float sphere_y_vec = sphere_y_ptr[s_base + s];
varying float sphere_z_vec = sphere_z_ptr[s_base + s];
varying float sphere_r_vec = sphere_radius_ptr[s_base + s];
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
varying float p_curr_x = p_pos_x_ptr[p_base + i + programIndex];
varying float p_curr_y = p_pos_y_ptr[p_base + i + programIndex];
varying float p_curr_z = p_pos_z_ptr[p_base + i + programIndex];
varying float p_prev_x = prev_p_pos_x_ptr[p_base + i + programIndex];
varying float p_prev_y = prev_p_pos_y_ptr[p_base + i + programIndex];
varying float p_prev_z = prev_p_pos_z_ptr[p_base + i + programIndex];
varying float p_inv_mass = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);
varying float dx = p_curr_x - sphere_x_vec;
varying float dy = p_curr_y - sphere_y_vec;
varying float dz = p_curr_z - sphere_z_vec;
varying float dist_sq = dx * dx + dy * dy + dz * dz;
varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
varying float dist = rcp(inv_dist);
varying float penetration = sphere_r_vec - dist;
varying float collision_mask = select(penetration > zero_vec, 1.0, 0.0);
varying float normal_x = dx * inv_dist;
varying float normal_y = dy * inv_dist;
varying float normal_z = dz * inv_dist;
varying float normal_corr_x = normal_x * penetration;
varying float normal_corr_y = normal_y * penetration;
varying float normal_corr_z = normal_z * penetration;
varying float v_x = p_curr_x - p_prev_x;
varying float v_y = p_curr_y - p_prev_y;
varying float v_z = p_curr_z - p_prev_z;
varying float vel_dot_normal = v_x * normal_x + v_y * normal_y + v_z * normal_z;
varying float tangent_x = v_x - vel_dot_normal * normal_x;
varying float tangent_y = v_y - vel_dot_normal * normal_y;
varying float tangent_z = v_z - vel_dot_normal * normal_z;
varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z;
varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec));
varying float friction_mag = penetration * friction_vec;
varying float friction_x = (tangent_x * inv_tangent_mag) * friction_mag;
varying float friction_y = (tangent_y * inv_tangent_mag) * friction_mag;
varying float friction_z = (tangent_z * inv_tangent_mag) * friction_mag;
varying float total_corr_x = (normal_corr_x - friction_x) * collision_mask * p_inv_mass;
varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass;
varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass;
p_pos_x_ptr[p_base + i + programIndex] = p_curr_x + total_corr_x;
p_pos_y_ptr[p_base + i + programIndex] = p_curr_y + total_corr_y;
p_pos_z_ptr[p_base + i + programIndex] = p_curr_z + total_corr_z;
}
}
}
// Step 5: Motion Constraints
for (uniform int c = 0; c < MAX_CHAINS; c += 1) {
uniform int p_base = c * PARTICLES_PER_CHAIN;
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) {
varying float px = p_pos_x_ptr[p_base + i + programIndex];
varying float py = p_pos_y_ptr[p_base + i + programIndex];
varying float pz = p_pos_z_ptr[p_base + i + programIndex];
varying float pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max);
varying float rx = rest_pos_x_ptr[p_base + i + programIndex];
varying float ry = rest_pos_y_ptr[p_base + i + programIndex];
varying float rz = rest_pos_z_ptr[p_base + i + programIndex];
varying float radius_vec = motion_radius_ptr[p_base + i + programIndex];
varying float dx = px - rx;
varying float dy = py - ry;
varying float dz = pz - rz;
varying float dist_sq = dx * dx + dy * dy + dz * dz;
varying float inv_dist = rsqrt(max(dist_sq, eps_vec));
varying float dist = rcp(inv_dist);
varying float penetration = dist - radius_vec;
varying float mask = select(penetration > zero_vec, 1.0, 0.0);
varying float corr_factor = penetration * inv_dist * mask;
varying float delta_x = dx * corr_factor * pm;
varying float delta_y = dy * corr_factor * pm;
varying float delta_z = dz * corr_factor * pm;
p_pos_x_ptr[p_base + i + programIndex] = px - delta_x;
p_pos_y_ptr[p_base + i + programIndex] = py - delta_y;
p_pos_z_ptr[p_base + i + programIndex] = pz - delta_z;
}
}
}
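// Hedged host-side sketch: how the export above might be driven once per frame
// from C. Assumptions (not part of the original gist): the ISPC compiler was
// invoked with a header output (-h chain_sim_ispc.h), and `struct chain_sim`
// has a C-side mirror with the same member names and 32-byte-aligned arrays.
#include "chain_sim_ispc.h"

static void
chains_tick(struct chain_sim *cs, float dt) {
  // Argument order follows the export's signature exactly.
  simulate_chains(
      cs->chain_quat_x, cs->chain_quat_y, cs->chain_quat_z, cs->chain_quat_w,
      cs->wind_x, cs->wind_y, cs->wind_z,
      cs->gravity_x, cs->gravity_y, cs->gravity_z,
      cs->chain_pos_x, cs->chain_pos_y, cs->chain_pos_z,
      cs->prev_chain_pos_x, cs->prev_chain_pos_y, cs->prev_chain_pos_z,
      cs->prev_chain_quat_x, cs->prev_chain_quat_y, cs->prev_chain_quat_z,
      cs->prev_chain_quat_w,
      cs->p_pos_x, cs->p_pos_y, cs->p_pos_z,
      cs->inv_mass,
      cs->prev_p_pos_x, cs->prev_p_pos_y, cs->prev_p_pos_z,
      cs->rest_lengths,
      cs->chain_configs,
      cs->sphere_x, cs->sphere_y, cs->sphere_z, cs->sphere_radius,
      cs->rest_pos_x, cs->rest_pos_y, cs->rest_pos_z,
      cs->motion_radius,
      dt);
}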