chain simulation
#include <arm_neon.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <float.h>

enum {
  MAX_CHNS = 64,
  PTC_PER_CHN = 16,
  SPH_PER_CHN = 8,
  MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
  MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
  CON_PER_CHN = PTC_PER_CHN,
  MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
  MAX_ITR = 16,
};
struct chn_cfg {
  float damping;             // [0, 1]
  float stiffness;           // [0, 1]
  float stretch_factor;
  float max_strain;
  float friction;            // [0, 1]
  float linear_inertia;      // [0, 1], 1.0f = physically correct linear motion
  float angular_inertia;     // [0, 1], 1.0f = physically correct angular rotation
  float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
};
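/* Illustrative configuration (not part of the original gist). The values are
 * assumptions chosen inside the documented [0, 1] ranges: damping near 1
 * keeps most of the per-step velocity in the Verlet integrator below, and
 * the three inertia factors at 1.0f give physically correct coupling to the
 * chain transform, per the comments above. */
static const struct chn_cfg chn_cfg_example = {
  .damping = 0.98f,
  .stiffness = 0.9f,
  .stretch_factor = 1.0f,
  .max_strain = 0.5f,
  .friction = 0.3f,
  .linear_inertia = 1.0f,
  .angular_inertia = 1.0f,
  .centrifugal_inertia = 1.0f,
};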
struct chn_sim {
  unsigned char free_idx_cnt;
  unsigned char free_idx[MAX_CHNS];
  struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32)));
  // chain global forces (world space gravity and wind)
  float chn_grav_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_grav_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_grav_z[MAX_CHNS] __attribute__((aligned(32)));
  float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32)));
  // chain transformations (world space)
  float chn_pos_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_pos_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_pos_z[MAX_CHNS] __attribute__((aligned(32)));
  float chn_quat_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_quat_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_quat_z[MAX_CHNS] __attribute__((aligned(32)));
  float chn_quat_w[MAX_CHNS] __attribute__((aligned(32)));
  float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32)));
  float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32)));
  float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32)));
  // particle positions (model space)
  float ptc_pos_x[MAX_PTC] __attribute__((aligned(32)));
  float ptc_pos_y[MAX_PTC] __attribute__((aligned(32)));
  float ptc_pos_z[MAX_PTC] __attribute__((aligned(32)));
  float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32)));
  float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32)));
  float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32)));
  float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32)));
  float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32)));
  // sphere positions (model space)
  float sph_x[MAX_SPH] __attribute__((aligned(32)));
  float sph_y[MAX_SPH] __attribute__((aligned(32)));
  float sph_z[MAX_SPH] __attribute__((aligned(32)));
  float sph_r[MAX_SPH] __attribute__((aligned(32)));
  // rest positions (model space)
  float rest_pos_x[MAX_PTC] __attribute__((aligned(32)));
  float rest_pos_y[MAX_PTC] __attribute__((aligned(32)));
  float rest_pos_z[MAX_PTC] __attribute__((aligned(32)));
  float motion_radius[MAX_PTC] __attribute__((aligned(32)));
};
extern void
chn_sim_init(struct chn_sim *cns) {
  assert(cns);
  // start from a clean state; the caller may pass uninitialized memory
  memset(cns, 0, sizeof(*cns));
  cns->free_idx_cnt = MAX_CHNS;
  for (int i = 0; i < MAX_CHNS; ++i) {
    cns->free_idx[i] = MAX_CHNS - i - 1;
  }
  for (int i = 0; i < MAX_CHNS; i += 4) {
    vst1q_f32(&cns->chn_quat_w[i], vdupq_n_f32(1.0f));
    vst1q_f32(&cns->chn_prev_quat_w[i], vdupq_n_f32(1.0f));
  }
}
extern int
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) {
  assert(cns);
  assert(cfg);
  assert(rest_pos);
  assert(cns->free_idx_cnt > 0);
  assert(cnt <= PTC_PER_CHN);
  assert(cnt > 1);
  int chn = cns->free_idx[--cns->free_idx_cnt];
  cns->chn_cfg[chn] = *cfg;
  int p_idx = (chn * PTC_PER_CHN);
  int r_idx = (chn * CON_PER_CHN);
  for (int i = 0; i < cnt; ++i) {
    cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2];
    cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3];
    // seed previous positions so the first Verlet step starts at rest
    cns->ptc_prev_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->ptc_prev_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->ptc_prev_pos_z[p_idx + i] = rest_pos[i*4+2];
    cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0];
    cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1];
    cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2];
  }
  for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->ptc_pos_x[p_idx + i] = 0.0f;
    cns->ptc_pos_y[p_idx + i] = 0.0f;
    cns->ptc_pos_z[p_idx + i] = 0.0f;
    cns->ptc_inv_mass[p_idx + i] = 0.0f; // padding particles stay immovable
  }
  for (int i = 1; i < cnt; ++i) {
    float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0];
    float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1];
    float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2];
    cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz);
  }
  // zero the unused constraint slots (cnt particles give cnt-1 constraints)
  for (int i = cnt - 1; i < CON_PER_CHN; ++i) {
    cns->ptc_rest_len[r_idx + i] = 0.0f;
  }
  return chn;
}
extern void
chn_sim_del(struct chn_sim *cns, int chn) {
  assert(cns);
  assert(chn >= 0);
  assert(chn < MAX_CHNS);
  int p_idx = (chn * PTC_PER_CHN);
  int s_idx = (chn * SPH_PER_CHN);
  int c_idx = (chn * CON_PER_CHN);
  cns->free_idx[cns->free_idx_cnt++] = chn;
  memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
  cns->chn_grav_x[chn] = 0.0f;
  cns->chn_grav_y[chn] = 0.0f;
  cns->chn_grav_z[chn] = 0.0f;
  cns->chn_wnd_x[chn] = 0.0f;
  cns->chn_wnd_y[chn] = 0.0f;
  cns->chn_wnd_z[chn] = 0.0f;
  cns->chn_pos_x[chn] = 0.0f;
  cns->chn_pos_y[chn] = 0.0f;
  cns->chn_pos_z[chn] = 0.0f;
  cns->chn_quat_x[chn] = 0.0f;
  cns->chn_quat_y[chn] = 0.0f;
  cns->chn_quat_z[chn] = 0.0f;
  cns->chn_quat_w[chn] = 1.0f;
  cns->chn_prev_pos_x[chn] = 0.0f;
  cns->chn_prev_pos_y[chn] = 0.0f;
  cns->chn_prev_pos_z[chn] = 0.0f;
  cns->chn_prev_quat_x[chn] = 0.0f;
  cns->chn_prev_quat_y[chn] = 0.0f;
  cns->chn_prev_quat_z[chn] = 0.0f;
  cns->chn_prev_quat_w[chn] = 1.0f;
  for (int i = 0; i < PTC_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_pos_z[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_inv_mass[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_prev_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_prev_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->ptc_prev_pos_z[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->rest_pos_x[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->rest_pos_y[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->rest_pos_z[p_idx + i], vdupq_n_f32(0.0f));
    vst1q_f32(&cns->motion_radius[p_idx + i], vdupq_n_f32(0.0f));
  }
  // Spheres
  vst1q_f32(&cns->sph_x[s_idx], vdupq_n_f32(0.0f));
  vst1q_f32(&cns->sph_y[s_idx], vdupq_n_f32(0.0f));
  vst1q_f32(&cns->sph_z[s_idx], vdupq_n_f32(0.0f));
  vst1q_f32(&cns->sph_r[s_idx], vdupq_n_f32(0.0f));
  vst1q_f32(&cns->sph_x[s_idx + 4], vdupq_n_f32(0.0f));
  vst1q_f32(&cns->sph_y[s_idx + 4], vdupq_n_f32(0.0f));
  vst1q_f32(&cns->sph_z[s_idx + 4], vdupq_n_f32(0.0f));
  vst1q_f32(&cns->sph_r[s_idx + 4], vdupq_n_f32(0.0f));
  // Loop by 4 for NEON
  for (int i = 0; i < CON_PER_CHN; i += 4) {
    vst1q_f32(&cns->ptc_rest_len[c_idx + i], vdupq_n_f32(0.0f));
  }
}
extern void
chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
  assert(cns);
  assert(pos3);
  assert(rot4);
  assert(chn >= 0);
  assert(chn < MAX_CHNS);
  cns->chn_pos_x[chn] = pos3[0];
  cns->chn_pos_y[chn] = pos3[1];
  cns->chn_pos_z[chn] = pos3[2];
  cns->chn_quat_x[chn] = rot4[0];
  cns->chn_quat_y[chn] = rot4[1];
  cns->chn_quat_z[chn] = rot4[2];
  cns->chn_quat_w[chn] = rot4[3];
}
extern void
chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) {
  assert(cns);
  assert(pos3);
  assert(rot4);
  assert(chn >= 0);
  assert(chn < MAX_CHNS);
  cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0];
  cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1];
  cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2];
  cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0];
  cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1];
  cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2];
  cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3];
}
extern void
chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) {
  assert(cns);
  assert(grav3);
  assert(chn >= 0);
  assert(chn < MAX_CHNS);
  cns->chn_grav_x[chn] = grav3[0];
  cns->chn_grav_y[chn] = grav3[1];
  cns->chn_grav_z[chn] = grav3[2];
}
extern void
chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) {
  assert(cns);
  assert(wind3);
  assert(chn >= 0);
  assert(chn < MAX_CHNS);
  cns->chn_wnd_x[chn] = wind3[0];
  cns->chn_wnd_y[chn] = wind3[1];
  cns->chn_wnd_z[chn] = wind3[2];
}
extern void
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) {
  assert(cns);
  assert(spheres);
  assert(chn >= 0);
  assert(chn < MAX_CHNS);
  assert(cnt <= SPH_PER_CHN);
  int s_idx = (chn * SPH_PER_CHN);
  for (int i = 0; i < cnt; i++) {
    cns->sph_x[s_idx + i] = spheres[i*4+0];
    cns->sph_y[s_idx + i] = spheres[i*4+1];
    cns->sph_z[s_idx + i] = spheres[i*4+2];
    cns->sph_r[s_idx + i] = spheres[i*4+3];
  }
  for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cns->sph_x[s_idx + i] = 0;
    cns->sph_y[s_idx + i] = 0;
    cns->sph_z[s_idx + i] = 0;
    cns->sph_r[s_idx + i] = 0;
  }
}
extern void
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) {
  assert(cns);
  assert(radius);
  assert(chn >= 0);
  assert(chn < MAX_CHNS);
  assert(cnt >= 0);
  assert(cnt <= PTC_PER_CHN);
  int p_idx = (chn * PTC_PER_CHN);
  for (int i = 0; i < cnt; ++i) {
    cns->motion_radius[p_idx + i] = radius[i];
  }
  for (int i = cnt; i < PTC_PER_CHN; ++i) {
    cns->motion_radius[p_idx + i] = FLT_MAX;
  }
}
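/* Scalar reference for the vectorized quaternion math in chn_sim_run below
 * (added for clarity; not in the original gist). Rotating a world-space
 * vector v into chain-local space computes q_conj * (0, v) * q, which is
 * exactly what the NEON code does for four chains at a time. */
static inline void
quat_inv_rotate3(const float *restrict q /* x,y,z,w */,
                 const float *restrict v, float *restrict out) {
  float cx = -q[0], cy = -q[1], cz = -q[2], qw = q[3]; // conjugate vector part
  // t = q_conj * (0, v): w = -v1.v2, vec = w1*v2 + v1 x v2
  float tw = -cx*v[0] - cy*v[1] - cz*v[2];
  float tx = qw*v[0] + cy*v[2] - cz*v[1];
  float ty = qw*v[1] + cz*v[0] - cx*v[2];
  float tz = qw*v[2] + cx*v[1] - cy*v[0];
  // out = vector part of t * q (the original, non-conjugated q)
  out[0] = tw*q[0] + tx*qw + ty*q[2] - tz*q[1];
  out[1] = tw*q[1] + ty*qw + tz*q[0] - tx*q[2];
  out[2] = tw*q[2] + tz*qw + tx*q[1] - ty*q[0];
}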
extern void
chn_sim_run(struct chn_sim *cns, float dt) {
  // NEON constants
  const float32x4_t dt_vec = vdupq_n_f32(dt);
  const float32x4_t dt2_vec = vdupq_n_f32(dt * dt);
  const float32x4_t one_vec = vdupq_n_f32(1.0f);
  const float32x4_t neg_one_vec = vdupq_n_f32(-1.0f);
  const float32x4_t eps_vec = vdupq_n_f32(1e-6f);
  const float32x4_t ptc_inv_mass_clamp_min = vdupq_n_f32(0.0f);
  const float32x4_t ptc_inv_mass_clamp_max = vdupq_n_f32(1e6f);
  const float32x4_t zero_vec = vdupq_n_f32(0.0f);
  const float32x4_t inv_dt_vec = vdupq_n_f32(1.0f / dt);
  const float32x4_t two_vec = vdupq_n_f32(2.0f);
  const float32x4_t inertia_clamp_min = vdupq_n_f32(0.0f);
  const float32x4_t inertia_clamp_max = vdupq_n_f32(1.0f);
  // NEON doesn't have a direct equivalent to _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF))
  // for an absolute-value mask. We use `vabsq_f32` where needed or reconstruct the logic.
  // Local arrays for PBD solver
  float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
  float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
  float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
  float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32)));
  // Initialize dummy elements
  px_lcl[PTC_PER_CHN] = 0.0f;
  py_lcl[PTC_PER_CHN] = 0.0f;
  pz_lcl[PTC_PER_CHN] = 0.0f;
  pm_lcl[PTC_PER_CHN] = 0.0f;
  // Stack arrays for precomputed data
  float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
  float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32)));
  float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32)));
  float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32)));
  float vel_x[MAX_CHNS] __attribute__((aligned(32)));
  float vel_y[MAX_CHNS] __attribute__((aligned(32)));
  float vel_z[MAX_CHNS] __attribute__((aligned(32)));
  float ang_vel_x[MAX_CHNS] __attribute__((aligned(32)));
  float ang_vel_y[MAX_CHNS] __attribute__((aligned(32)));
  float ang_vel_z[MAX_CHNS] __attribute__((aligned(32)));
  // Step 0: Precomputation
  for (int c = 0; c < MAX_CHNS; c += 4) {
    // Load chain quaternions
    float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]);
    float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]);
    float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]);
    float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]);
    // Create q_conjugate components (qw remains, qx, qy, qz are negated)
    float32x4_t cqx = vnegq_f32(qx); // -qx
    float32x4_t cqy = vnegq_f32(qy); // -qy
    float32x4_t cqz = vnegq_f32(qz); // -qz
    // Compute local-space wind
    {
      float32x4_t vx = vld1q_f32(&cns->chn_wnd_x[c]);
      float32x4_t vy = vld1q_f32(&cns->chn_wnd_y[c]);
      float32x4_t vz = vld1q_f32(&cns->chn_wnd_z[c]);
      // Step 1: t = q_conj * v_world_as_quat
      // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
      // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
      // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
      // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
      // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
      float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz); // qw*vx + (-qy)*vz
      tx = vmlsq_f32(tx, cqz, vy); // qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
      float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx); // qw*vy + (-qz)*vx
      ty = vmlsq_f32(ty, cqx, vz); // qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
      float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy); // qw*vz + (-qx)*vy
      tz = vmlsq_f32(tz, cqy, vx); // qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
      float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy)); // - ((-qx)*vx + (-qy)*vy) = qx*vx + qy*vy
      tw = vmlsq_f32(tw, cqz, vz); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
      // Step 2: res_vec = (t * q)_vec
      // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
      // res_x = tw*qx + tx*qw + ty*qz - tz*qy
      // res_y = tw*qy + ty*qw + tz*qx - tx*qz
      // res_z = tw*qz + tz*qw + tx*qy - ty*qx
      float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
      res_x = vmlsq_f32(res_x, tz, qy);
      res_x = vmlaq_f32(res_x, ty, qz); // Corrected: tw*qx + tx*qw + ty*qz - tz*qy
      float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
      res_y = vmlsq_f32(res_y, tx, qz);
      res_y = vmlaq_f32(res_y, tz, qx); // Corrected: tw*qy + ty*qw + tz*qx - tx*qz
      float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
      res_z = vmlsq_f32(res_z, ty, qx);
      res_z = vmlaq_f32(res_z, tx, qy); // Corrected: tw*qz + tz*qw + tx*qy - ty*qx
      vst1q_f32(&chn_wnd_lcl_x[c], res_x);
      vst1q_f32(&chn_wnd_lcl_y[c], res_y);
      vst1q_f32(&chn_wnd_lcl_z[c], res_z);
    }
    // Compute local-space gravity
    {
      float32x4_t vx = vld1q_f32(&cns->chn_grav_x[c]);
      float32x4_t vy = vld1q_f32(&cns->chn_grav_y[c]);
      float32x4_t vz = vld1q_f32(&cns->chn_grav_z[c]);
      // Step 1: t = q_conj * v_world_as_quat
      float32x4_t tx = vmlaq_f32(vmulq_f32(qw, vx), cqy, vz);
      tx = vmlsq_f32(tx, cqz, vy);
      float32x4_t ty = vmlaq_f32(vmulq_f32(qw, vy), cqz, vx);
      ty = vmlsq_f32(ty, cqx, vz);
      float32x4_t tz = vmlaq_f32(vmulq_f32(qw, vz), cqx, vy);
      tz = vmlsq_f32(tz, cqy, vx);
      float32x4_t tw = vnegq_f32(vmlaq_f32(vmulq_f32(cqx, vx), cqy, vy));
      tw = vmlsq_f32(tw, cqz, vz);
      // Step 2: res_vec = (t * q)_vec
      float32x4_t res_x = vmlaq_f32(vmulq_f32(tw, qx), tx, qw);
      res_x = vmlsq_f32(res_x, tz, qy);
      res_x = vmlaq_f32(res_x, ty, qz);
      float32x4_t res_y = vmlaq_f32(vmulq_f32(tw, qy), ty, qw);
      res_y = vmlsq_f32(res_y, tx, qz);
      res_y = vmlaq_f32(res_y, tz, qx);
      float32x4_t res_z = vmlaq_f32(vmulq_f32(tw, qz), tz, qw);
      res_z = vmlsq_f32(res_z, ty, qx);
      res_z = vmlaq_f32(res_z, tx, qy);
      vst1q_f32(&chn_grav_lcl_x[c], res_x);
      vst1q_f32(&chn_grav_lcl_y[c], res_y);
      vst1q_f32(&chn_grav_lcl_z[c], res_z);
    }
    // Compute linear velocity
    {
      float32x4_t curr_x = vld1q_f32(&cns->chn_pos_x[c]);
      float32x4_t curr_y = vld1q_f32(&cns->chn_pos_y[c]);
      float32x4_t curr_z = vld1q_f32(&cns->chn_pos_z[c]);
      float32x4_t prev_x = vld1q_f32(&cns->chn_prev_pos_x[c]);
      float32x4_t prev_y = vld1q_f32(&cns->chn_prev_pos_y[c]);
      float32x4_t prev_z = vld1q_f32(&cns->chn_prev_pos_z[c]);
      float32x4_t vel_x_vec = vmulq_f32(vsubq_f32(curr_x, prev_x), inv_dt_vec);
      float32x4_t vel_y_vec = vmulq_f32(vsubq_f32(curr_y, prev_y), inv_dt_vec);
      float32x4_t vel_z_vec = vmulq_f32(vsubq_f32(curr_z, prev_z), inv_dt_vec);
      vst1q_f32(&vel_x[c], vel_x_vec);
      vst1q_f32(&vel_y[c], vel_y_vec);
      vst1q_f32(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity
    {
      float32x4_t qx = vld1q_f32(&cns->chn_quat_x[c]);
      float32x4_t qy = vld1q_f32(&cns->chn_quat_y[c]);
      float32x4_t qz = vld1q_f32(&cns->chn_quat_z[c]);
      float32x4_t qw = vld1q_f32(&cns->chn_quat_w[c]);
      float32x4_t prev_qx = vld1q_f32(&cns->chn_prev_quat_x[c]);
      float32x4_t prev_qy = vld1q_f32(&cns->chn_prev_quat_y[c]);
      float32x4_t prev_qz = vld1q_f32(&cns->chn_prev_quat_z[c]);
      float32x4_t prev_qw = vld1q_f32(&cns->chn_prev_quat_w[c]);
      // Step 1: Compute delta quaternion (to - from)
      float32x4_t dt_qx = vsubq_f32(qx, prev_qx);
      float32x4_t dt_qy = vsubq_f32(qy, prev_qy);
      float32x4_t dt_qz = vsubq_f32(qz, prev_qz);
      float32x4_t dt_qw = vsubq_f32(qw, prev_qw);
      // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
      float32x4_t conj_qx = vnegq_f32(prev_qx);
      float32x4_t conj_qy = vnegq_f32(prev_qy);
      float32x4_t conj_qz = vnegq_f32(prev_qz);
      float32x4_t conj_qw = prev_qw;
      // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
      // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
      // Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
      float32x4_t orient_qw = vmlsq_f32(vmulq_f32(dt_qw, conj_qw), dt_qx, conj_qx);
      orient_qw = vmlsq_f32(orient_qw, dt_qy, conj_qy);
      orient_qw = vmlsq_f32(orient_qw, dt_qz, conj_qz);
      float32x4_t cross_x = vmlsq_f32(vmulq_f32(dt_qy, conj_qz), dt_qz, conj_qy);
      float32x4_t cross_y = vmlsq_f32(vmulq_f32(dt_qz, conj_qx), dt_qx, conj_qz);
      float32x4_t cross_z = vmlsq_f32(vmulq_f32(dt_qx, conj_qy), dt_qy, conj_qx);
      float32x4_t orient_qx = vmlaq_f32(vmulq_f32(dt_qw, conj_qx), dt_qx, conj_qw);
      orient_qx = vaddq_f32(orient_qx, cross_x);
      float32x4_t orient_qy = vmlaq_f32(vmulq_f32(dt_qw, conj_qy), dt_qy, conj_qw);
      orient_qy = vaddq_f32(orient_qy, cross_y);
      float32x4_t orient_qz = vmlaq_f32(vmulq_f32(dt_qw, conj_qz), dt_qz, conj_qw);
      orient_qz = vaddq_f32(orient_qz, cross_z);
      // Step 4: Compute dot product (to, from) for shortest arc check
      float32x4_t dot = vmlaq_f32(vmulq_f32(qw, prev_qw), qx, prev_qx);
      dot = vmlaq_f32(dot, qy, prev_qy);
      dot = vmlaq_f32(dot, qz, prev_qz);
      // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
      uint32x4_t negate_mask = vcltq_f32(dot, zero_vec); // all-ones if dot < 0, 0 otherwise
      float32x4_t sign_selector = vbslq_f32(negate_mask, neg_one_vec, one_vec); // -1.0f if true, 1.0f if false
      orient_qx = vmulq_f32(orient_qx, sign_selector);
      orient_qy = vmulq_f32(orient_qy, sign_selector);
      orient_qz = vmulq_f32(orient_qz, sign_selector);
      // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
      // Inlined vrecip_f32_high_precision
      float32x4_t rcp_two_dt;
      rcp_two_dt = vrecpeq_f32(vmulq_f32(two_vec, dt_vec));
      rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));
      rcp_two_dt = vmulq_f32(rcp_two_dt, vrecpsq_f32(vmulq_f32(two_vec, dt_vec), rcp_two_dt));
      float32x4_t ang_vel_x_vec = vmulq_f32(orient_qx, rcp_two_dt);
      float32x4_t ang_vel_y_vec = vmulq_f32(orient_qy, rcp_two_dt);
      float32x4_t ang_vel_z_vec = vmulq_f32(orient_qz, rcp_two_dt);
      // Store results
      vst1q_f32(&ang_vel_x[c], ang_vel_x_vec);
      vst1q_f32(&ang_vel_y[c], ang_vel_y_vec);
      vst1q_f32(&ang_vel_z[c], ang_vel_z_vec);
    }
  }
  // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
  for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    // Load precomputed velocities and inertia parameters
    float32x4_t vel_x_vec = vdupq_n_f32(vel_x[c]);
    float32x4_t vel_y_vec = vdupq_n_f32(vel_y[c]);
    float32x4_t vel_z_vec = vdupq_n_f32(vel_z[c]);
    float32x4_t ang_vel_x_vec = vdupq_n_f32(ang_vel_x[c]);
    float32x4_t ang_vel_y_vec = vdupq_n_f32(ang_vel_y[c]);
    float32x4_t ang_vel_z_vec = vdupq_n_f32(ang_vel_z[c]);
    float32x4_t linear_inertia_vec = vdupq_n_f32(cfg->linear_inertia);
    float32x4_t angular_inertia_vec = vdupq_n_f32(cfg->angular_inertia);
    float32x4_t centrifugal_inertia_vec = vdupq_n_f32(cfg->centrifugal_inertia);
    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = vmaxq_f32(vminq_f32(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = vmaxq_f32(vminq_f32(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = vmaxq_f32(vminq_f32(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
      float32x4_t px = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
      float32x4_t py = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
      float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
      float32x4_t p_ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
      p_ptc_inv_mass = vmaxq_f32(vminq_f32(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
      // Linear inertia: v * dt * linear_inertia
      float32x4_t lin_dt_x = vmulq_f32(vmulq_f32(vel_x_vec, dt_vec), linear_inertia_vec);
      float32x4_t lin_dt_y = vmulq_f32(vmulq_f32(vel_y_vec, dt_vec), linear_inertia_vec);
      float32x4_t lin_dt_z = vmulq_f32(vmulq_f32(vel_z_vec, dt_vec), linear_inertia_vec);
      // Angular inertia: (ω × r) * dt * angular_inertia
      float32x4_t ang_dt_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
      float32x4_t ang_dt_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
      float32x4_t ang_dt_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);
      ang_dt_x = vmulq_f32(vmulq_f32(ang_dt_x, dt_vec), angular_inertia_vec);
      ang_dt_y = vmulq_f32(vmulq_f32(ang_dt_y, dt_vec), angular_inertia_vec);
      ang_dt_z = vmulq_f32(vmulq_f32(ang_dt_z, dt_vec), angular_inertia_vec);
      // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
      float32x4_t cross1_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, pz), ang_vel_z_vec, py);
      float32x4_t cross1_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, px), ang_vel_x_vec, pz);
      float32x4_t cross1_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, py), ang_vel_y_vec, px);
      float32x4_t cross2_x = vmlsq_f32(vmulq_f32(ang_vel_y_vec, cross1_z), ang_vel_z_vec, cross1_y);
      float32x4_t cross2_y = vmlsq_f32(vmulq_f32(ang_vel_z_vec, cross1_x), ang_vel_x_vec, cross1_z);
      float32x4_t cross2_z = vmlsq_f32(vmulq_f32(ang_vel_x_vec, cross1_y), ang_vel_y_vec, cross1_x);
      // Calculate displacement: (ω × (ω × r)) * dt^2
      float32x4_t base_cent_dt_x = vmulq_f32(cross2_x, dt2_vec);
      float32x4_t base_cent_dt_y = vmulq_f32(cross2_y, dt2_vec);
      float32x4_t base_cent_dt_z = vmulq_f32(cross2_z, dt2_vec);
      // Apply the centrifugal_inertia factor
      float32x4_t cent_dt_x = vmulq_f32(base_cent_dt_x, centrifugal_inertia_vec);
      float32x4_t cent_dt_y = vmulq_f32(base_cent_dt_y, centrifugal_inertia_vec);
      float32x4_t cent_dt_z = vmulq_f32(base_cent_dt_z, centrifugal_inertia_vec);
      // Total delta
      float32x4_t dt_x = vaddq_f32(vaddq_f32(lin_dt_x, ang_dt_x), cent_dt_x);
      float32x4_t dt_y = vaddq_f32(vaddq_f32(lin_dt_y, ang_dt_y), cent_dt_y);
      float32x4_t dt_z = vaddq_f32(vaddq_f32(lin_dt_z, ang_dt_z), cent_dt_z);
      // Update positions
      vst1q_f32(&cns->ptc_pos_x[base_idx + i], vaddq_f32(px, dt_x));
      vst1q_f32(&cns->ptc_pos_y[base_idx + i], vaddq_f32(py, dt_y));
      vst1q_f32(&cns->ptc_pos_z[base_idx + i], vaddq_f32(pz, dt_z));
    }
    // Update previous transformation
    cns->chn_prev_pos_x[c] = cns->chn_pos_x[c];
    cns->chn_prev_pos_y[c] = cns->chn_pos_y[c];
    cns->chn_prev_pos_z[c] = cns->chn_pos_z[c];
    cns->chn_prev_quat_x[c] = cns->chn_quat_x[c];
    cns->chn_prev_quat_y[c] = cns->chn_quat_y[c];
    cns->chn_prev_quat_z[c] = cns->chn_quat_z[c];
    cns->chn_prev_quat_w[c] = cns->chn_quat_w[c];
  }
  // Step 2: Verlet Integration
  for (int c = 0; c < MAX_CHNS; ++c) {
    int base_idx = c * PTC_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    float32x4_t chn_grav_x_vec = vdupq_n_f32(chn_grav_lcl_x[c]);
    float32x4_t chn_grav_y_vec = vdupq_n_f32(chn_grav_lcl_y[c]);
    float32x4_t chn_grav_z_vec = vdupq_n_f32(chn_grav_lcl_z[c]);
    float32x4_t chn_wnd_x_vec = vdupq_n_f32(chn_wnd_lcl_x[c]);
    float32x4_t chn_wnd_y_vec = vdupq_n_f32(chn_wnd_lcl_y[c]);
    float32x4_t chn_wnd_z_vec = vdupq_n_f32(chn_wnd_lcl_z[c]);
    float32x4_t damping_vec = vdupq_n_f32(cfg->damping);
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
      float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[base_idx + i]);
      float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[base_idx + i]);
      float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[base_idx + i]);
      float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[base_idx + i]);
      ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
      float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[base_idx + i]);
      float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[base_idx + i]);
      float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[base_idx + i]);
      float32x4_t force_x = vaddq_f32(chn_grav_x_vec, chn_wnd_x_vec);
      float32x4_t force_y = vaddq_f32(chn_grav_y_vec, chn_wnd_y_vec);
      float32x4_t force_z = vaddq_f32(chn_grav_z_vec, chn_wnd_z_vec);
      float32x4_t acc_x = vmulq_f32(force_x, ptc_inv_mass);
      float32x4_t acc_y = vmulq_f32(force_y, ptc_inv_mass);
      float32x4_t acc_z = vmulq_f32(force_z, ptc_inv_mass);
      float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
      float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
      float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);
      float32x4_t damped_vel_x = vmulq_f32(vel_x, damping_vec);
      float32x4_t damped_vel_y = vmulq_f32(vel_y, damping_vec);
      float32x4_t damped_vel_z = vmulq_f32(vel_z, damping_vec);
      float32x4_t term_accel_x = vmulq_f32(acc_x, dt2_vec);
      float32x4_t term_accel_y = vmulq_f32(acc_y, dt2_vec);
      float32x4_t term_accel_z = vmulq_f32(acc_z, dt2_vec);
      float32x4_t new_p_x = vaddq_f32(p_curr_x, vaddq_f32(damped_vel_x, term_accel_x));
      float32x4_t new_p_y = vaddq_f32(p_curr_y, vaddq_f32(damped_vel_y, term_accel_y));
      float32x4_t new_p_z = vaddq_f32(p_curr_z, vaddq_f32(damped_vel_z, term_accel_z));
      vst1q_f32(&cns->ptc_pos_x[base_idx + i], new_p_x);
      vst1q_f32(&cns->ptc_pos_y[base_idx + i], new_p_y);
      vst1q_f32(&cns->ptc_pos_z[base_idx + i], new_p_z);
      vst1q_f32(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x);
      vst1q_f32(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y);
      vst1q_f32(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z);
    }
  }
  // Step 3: Distance Constraints
  for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int r_base = c * CON_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    float32x4_t stiff_vec = vdupq_n_f32(cfg->stiffness);
    float32x4_t stretch_factor_vec = vdupq_n_f32(cfg->stretch_factor);
    float32x4_t max_strain_vec_abs = vdupq_n_f32(cfg->max_strain);
    float32x4_t neg_max_strain_vec_abs = vmulq_f32(max_strain_vec_abs, neg_one_vec);
    // Loop by 4 for NEON
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
      float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
      pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
      vst1q_f32(&px_lcl[i], vld1q_f32(&cns->ptc_pos_x[p_base + i]));
      vst1q_f32(&py_lcl[i], vld1q_f32(&cns->ptc_pos_y[p_base + i]));
      vst1q_f32(&pz_lcl[i], vld1q_f32(&cns->ptc_pos_z[p_base + i]));
      vst1q_f32(&pm_lcl[i], pm);
    }
    for (int iter = 0; iter < MAX_ITR; ++iter) {
      // Loop by 4 for NEON
      for (int i = 0; i < PTC_PER_CHN; i += 4) {
        float32x4_t p0x = vld1q_f32(&px_lcl[i]);
        float32x4_t p0y = vld1q_f32(&py_lcl[i]);
        float32x4_t p0z = vld1q_f32(&pz_lcl[i]);
        float32x4_t p0m = vld1q_f32(&pm_lcl[i]);
        float32x4_t p1x = vld1q_f32(&px_lcl[i + 1]);
        float32x4_t p1y = vld1q_f32(&py_lcl[i + 1]);
        float32x4_t p1z = vld1q_f32(&pz_lcl[i + 1]);
        float32x4_t p1m = vld1q_f32(&pm_lcl[i + 1]);
        float32x4_t rest_len_vec = vld1q_f32(&cns->ptc_rest_len[r_base + i]);
        float32x4_t dx = vsubq_f32(p1x, p0x);
        float32x4_t dy = vsubq_f32(p1y, p0y);
        float32x4_t dz = vsubq_f32(p1z, p0z);
        float32x4_t dist_sq = vmulq_f32(dx, dx);
        dist_sq = vmlaq_f32(dist_sq, dy, dy);
        dist_sq = vmlaq_f32(dist_sq, dz, dz);
        float32x4_t inv_dist;
        float32x4_t x_rsqrt = vmaxq_f32(dist_sq, eps_vec);
        inv_dist = vrsqrteq_f32(x_rsqrt);
        inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));
        inv_dist = vmulq_f32(inv_dist, vrsqrtsq_f32(x_rsqrt, vmulq_f32(inv_dist, inv_dist)));
        float32x4_t w_sum = vaddq_f32(p0m, p1m);
        float32x4_t w_sum_clamped = vmaxq_f32(w_sum, eps_vec);
        float32x4_t rcp_w_sum;
        rcp_w_sum = vrecpeq_f32(w_sum_clamped);
        rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));
        rcp_w_sum = vmulq_f32(rcp_w_sum, vrecpsq_f32(w_sum_clamped, rcp_w_sum));
        float32x4_t dist;
        dist = vrecpeq_f32(inv_dist);
        dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));
        dist = vmulq_f32(dist, vrecpsq_f32(inv_dist, dist));
        float32x4_t diff = vsubq_f32(dist, rest_len_vec);
        float32x4_t max_rest_len_eps = vmaxq_f32(rest_len_vec, eps_vec);
        float32x4_t rcp_max_rest_len_eps;
        rcp_max_rest_len_eps = vrecpeq_f32(max_rest_len_eps);
        rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
        rcp_max_rest_len_eps = vmulq_f32(rcp_max_rest_len_eps, vrecpsq_f32(max_rest_len_eps, rcp_max_rest_len_eps));
        float32x4_t strain = vmulq_f32(diff, rcp_max_rest_len_eps);
        float32x4_t strain_clamped = vmaxq_f32(neg_max_strain_vec_abs, vminq_f32(strain, max_strain_vec_abs));
        float32x4_t dyn_stiff_denom = vaddq_f32(one_vec, vabsq_f32(strain_clamped));
        float32x4_t rcp_dyn_stiff_denom;
        rcp_dyn_stiff_denom = vrecpeq_f32(dyn_stiff_denom);
        rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
        rcp_dyn_stiff_denom = vmulq_f32(rcp_dyn_stiff_denom, vrecpsq_f32(dyn_stiff_denom, rcp_dyn_stiff_denom));
        float32x4_t dyn_stiff = vmulq_f32(stiff_vec, rcp_dyn_stiff_denom);
        float32x4_t corr_scl_part = vmulq_f32(dyn_stiff, stretch_factor_vec);
        corr_scl_part = vmulq_f32(corr_scl_part, rcp_w_sum);
        float32x4_t corr_magnitude = vmulq_f32(diff, corr_scl_part);
        // Only correct non-degenerate pairs, and skip the zero-rest-length
        // padding constraints at the chain tail (they would otherwise pull
        // the last real particle toward the dummy particle at the origin).
        uint32x4_t apply_corr_mask = vandq_u32(vcgtq_f32(dist_sq, eps_vec),
                                               vcgtq_f32(rest_len_vec, zero_vec));
        float32x4_t dt_unit_x = vmulq_f32(dx, inv_dist);
        float32x4_t dt_unit_y = vmulq_f32(dy, inv_dist);
        float32x4_t dt_unit_z = vmulq_f32(dz, inv_dist);
        float32x4_t dt_x = vmulq_f32(dt_unit_x, corr_magnitude);
        float32x4_t dt_y = vmulq_f32(dt_unit_y, corr_magnitude);
        float32x4_t dt_z = vmulq_f32(dt_unit_z, corr_magnitude);
        dt_x = vbslq_f32(apply_corr_mask, dt_x, zero_vec); // Apply mask: if mask is 0, set to 0
        dt_y = vbslq_f32(apply_corr_mask, dt_y, zero_vec);
        dt_z = vbslq_f32(apply_corr_mask, dt_z, zero_vec);
        vst1q_f32(&px_lcl[i], vmlaq_f32(p0x, dt_x, p0m));
        vst1q_f32(&py_lcl[i], vmlaq_f32(p0y, dt_y, p0m));
        vst1q_f32(&pz_lcl[i], vmlaq_f32(p0z, dt_z, p0m));
        vst1q_f32(&px_lcl[i + 1], vmlsq_f32(p1x, dt_x, p1m));
        vst1q_f32(&py_lcl[i + 1], vmlsq_f32(p1y, dt_y, p1m));
        vst1q_f32(&pz_lcl[i + 1], vmlsq_f32(p1z, dt_z, p1m));
      }
    }
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
      vst1q_f32(&cns->ptc_pos_x[p_base + i], vld1q_f32(&px_lcl[i]));
      vst1q_f32(&cns->ptc_pos_y[p_base + i], vld1q_f32(&py_lcl[i]));
      vst1q_f32(&cns->ptc_pos_z[p_base + i], vld1q_f32(&pz_lcl[i]));
    }
  }
  // Step 4: Sphere Collisions with Friction
  for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    int s_base = c * SPH_PER_CHN;
    struct chn_cfg *cfg = &cns->chn_cfg[c];
    float32x4_t fric_vec = vdupq_n_f32(cfg->friction);
    for (int s = 0; s < SPH_PER_CHN; ++s) {
      float32x4_t sph_x_vec = vdupq_n_f32(cns->sph_x[s_base + s]);
      float32x4_t sph_y_vec = vdupq_n_f32(cns->sph_y[s_base + s]);
      float32x4_t sph_z_vec = vdupq_n_f32(cns->sph_z[s_base + s]);
      float32x4_t sph_r_vec = vdupq_n_f32(cns->sph_r[s_base + s]);
      for (int i = 0; i < PTC_PER_CHN; i += 4) {
        float32x4_t p_curr_x = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
        float32x4_t p_curr_y = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
        float32x4_t p_curr_z = vld1q_f32(&cns->ptc_pos_z[p_base + i]);
        float32x4_t p_prev_x = vld1q_f32(&cns->ptc_prev_pos_x[p_base + i]);
        float32x4_t p_prev_y = vld1q_f32(&cns->ptc_prev_pos_y[p_base + i]);
        float32x4_t p_prev_z = vld1q_f32(&cns->ptc_prev_pos_z[p_base + i]);
        float32x4_t ptc_inv_mass = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
        ptc_inv_mass = vmaxq_f32(vminq_f32(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
        float32x4_t dx = vsubq_f32(p_curr_x, sph_x_vec);
        float32x4_t dy = vsubq_f32(p_curr_y, sph_y_vec);
        float32x4_t dz = vsubq_f32(p_curr_z, sph_z_vec);
        float32x4_t dist_sq = vmulq_f32(dx, dx);
        dist_sq = vmlaq_f32(dist_sq, dy, dy);
        dist_sq = vmlaq_f32(dist_sq, dz, dz);
        float32x4_t inv_dist_sphere;
        float32x4_t x_rsqrt_sphere = vmaxq_f32(dist_sq, eps_vec);
        inv_dist_sphere = vrsqrteq_f32(x_rsqrt_sphere);
        inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
        inv_dist_sphere = vmulq_f32(inv_dist_sphere, vrsqrtsq_f32(x_rsqrt_sphere, vmulq_f32(inv_dist_sphere, inv_dist_sphere)));
        float32x4_t inv_dist = inv_dist_sphere;
        float32x4_t dist_sphere;
        dist_sphere = vrecpeq_f32(inv_dist);
        dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
        dist_sphere = vmulq_f32(dist_sphere, vrecpsq_f32(inv_dist, dist_sphere));
        float32x4_t dist = dist_sphere;
        float32x4_t pen = vsubq_f32(sph_r_vec, dist);
        uint32x4_t col_mask = vcgtq_f32(pen, zero_vec); // all-ones if pen > 0, 0 otherwise
        float32x4_t norm_x = vmulq_f32(dx, inv_dist);
        float32x4_t norm_y = vmulq_f32(dy, inv_dist);
        float32x4_t norm_z = vmulq_f32(dz, inv_dist);
        float32x4_t norm_corr_x = vmulq_f32(norm_x, pen);
        float32x4_t norm_corr_y = vmulq_f32(norm_y, pen);
        float32x4_t norm_corr_z = vmulq_f32(norm_z, pen);
        float32x4_t vel_x = vsubq_f32(p_curr_x, p_prev_x);
        float32x4_t vel_y = vsubq_f32(p_curr_y, p_prev_y);
        float32x4_t vel_z = vsubq_f32(p_curr_z, p_prev_z);
        float32x4_t vel_dot_norm = vmulq_f32(vel_x, norm_x);
        vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_y, norm_y);
        vel_dot_norm = vmlaq_f32(vel_dot_norm, vel_z, norm_z);
        float32x4_t tan_x = vmlsq_f32(vel_x, vel_dot_norm, norm_x);
        float32x4_t tan_y = vmlsq_f32(vel_y, vel_dot_norm, norm_y);
        float32x4_t tan_z = vmlsq_f32(vel_z, vel_dot_norm, norm_z);
        float32x4_t tan_mag_sq = vmulq_f32(tan_x, tan_x);
        tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_y, tan_y);
        tan_mag_sq = vmlaq_f32(tan_mag_sq, tan_z, tan_z);
        float32x4_t inv_tan_mag_sphere;
        float32x4_t x_rsqrt_tan_mag = vmaxq_f32(tan_mag_sq, eps_vec);
        inv_tan_mag_sphere = vrsqrteq_f32(x_rsqrt_tan_mag);
        inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
        inv_tan_mag_sphere = vmulq_f32(inv_tan_mag_sphere, vrsqrtsq_f32(x_rsqrt_tan_mag, vmulq_f32(inv_tan_mag_sphere, inv_tan_mag_sphere)));
        float32x4_t inv_tan_mag = inv_tan_mag_sphere;
        float32x4_t fric_mag = vmulq_f32(pen, fric_vec);
        float32x4_t fric_x = vmulq_f32(vmulq_f32(tan_x, inv_tan_mag), fric_mag);
        float32x4_t fric_y = vmulq_f32(vmulq_f32(tan_y, inv_tan_mag), fric_mag);
        float32x4_t fric_z = vmulq_f32(vmulq_f32(tan_z, inv_tan_mag), fric_mag);
        float32x4_t total_corr_x = vsubq_f32(norm_corr_x, fric_x);
        float32x4_t total_corr_y = vsubq_f32(norm_corr_y, fric_y);
        float32x4_t total_corr_z = vsubq_f32(norm_corr_z, fric_z);
        total_corr_x = vbslq_f32(col_mask, total_corr_x, zero_vec);
        total_corr_y = vbslq_f32(col_mask, total_corr_y, zero_vec);
        total_corr_z = vbslq_f32(col_mask, total_corr_z, zero_vec);
        total_corr_x = vmulq_f32(total_corr_x, ptc_inv_mass);
        total_corr_y = vmulq_f32(total_corr_y, ptc_inv_mass);
        total_corr_z = vmulq_f32(total_corr_z, ptc_inv_mass);
        float32x4_t new_p_x = vaddq_f32(p_curr_x, total_corr_x);
        float32x4_t new_p_y = vaddq_f32(p_curr_y, total_corr_y);
        float32x4_t new_p_z = vaddq_f32(p_curr_z, total_corr_z);
        vst1q_f32(&cns->ptc_pos_x[p_base + i], new_p_x);
        vst1q_f32(&cns->ptc_pos_y[p_base + i], new_p_y);
        vst1q_f32(&cns->ptc_pos_z[p_base + i], new_p_z);
      }
    }
  }
  // Step 5: Motion Constraints
  for (int c = 0; c < MAX_CHNS; ++c) {
    int p_base = c * PTC_PER_CHN;
    // Loop by 4 for NEON
    for (int i = 0; i < PTC_PER_CHN; i += 4) {
      float32x4_t px = vld1q_f32(&cns->ptc_pos_x[p_base + i]);
      float32x4_t py = vld1q_f32(&cns->ptc_pos_y[p_base + i]);
      float32x4_t pz = vld1q_f32(&cns->ptc_pos_z[p_base + i]);
      float32x4_t pm = vld1q_f32(&cns->ptc_inv_mass[p_base + i]);
      pm = vmaxq_f32(vminq_f32(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min);
      float32x4_t rx = vld1q_f32(&cns->rest_pos_x[p_base + i]);
      float32x4_t ry = vld1q_f32(&cns->rest_pos_y[p_base + i]);
      float32x4_t rz = vld1q_f32(&cns->rest_pos_z[p_base + i]);
      float32x4_t r_vec = vld1q_f32(&cns->motion_radius[p_base + i]);
      float32x4_t dx = vsubq_f32(px, rx);
      float32x4_t dy = vsubq_f32(py, ry);
      float32x4_t dz = vsubq_f32(pz, rz);
      float32x4_t dist_sq = vmulq_f32(dx, dx);
      dist_sq = vmlaq_f32(dist_sq, dy, dy);
      dist_sq = vmlaq_f32(dist_sq, dz, dz);
      float32x4_t inv_dist_motion;
      float32x4_t x_rsqrt_motion = vmaxq_f32(dist_sq, eps_vec);
      inv_dist_motion = vrsqrteq_f32(x_rsqrt_motion);
      inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
      inv_dist_motion = vmulq_f32(inv_dist_motion, vrsqrtsq_f32(x_rsqrt_motion, vmulq_f32(inv_dist_motion, inv_dist_motion)));
      float32x4_t inv_dist = inv_dist_motion;
      float32x4_t dist_motion;
      dist_motion = vrecpeq_f32(inv_dist);
      dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
      dist_motion = vmulq_f32(dist_motion, vrecpsq_f32(inv_dist, dist_motion));
      float32x4_t dist = dist_motion;
      float32x4_t pen = vsubq_f32(dist, r_vec);
      uint32x4_t mask = vcgtq_f32(pen, zero_vec); // all-ones if pen > 0, 0 otherwise
      float32x4_t corr_factor = vmulq_f32(pen, inv_dist);
      corr_factor = vbslq_f32(mask, corr_factor, zero_vec);
      float32x4_t dt_x = vmulq_f32(dx, corr_factor);
      float32x4_t dt_y = vmulq_f32(dy, corr_factor);
      float32x4_t dt_z = vmulq_f32(dz, corr_factor);
      dt_x = vmulq_f32(dt_x, pm);
      dt_y = vmulq_f32(dt_y, pm);
      dt_z = vmulq_f32(dt_z, pm);
      vst1q_f32(&cns->ptc_pos_x[p_base + i], vsubq_f32(px, dt_x));
      vst1q_f32(&cns->ptc_pos_y[p_base + i], vsubq_f32(py, dt_y));
      vst1q_f32(&cns->ptc_pos_z[p_base + i], vsubq_f32(pz, dt_z));
    }
  }
}
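/* Usage sketch (not part of the original gist; the function name and the
 * numeric values are illustrative assumptions). It shows the intended call
 * sequence: each rest-pose entry is xyzw = model-space position plus inverse
 * mass, where inverse mass 0 pins the root particle; the solver is then
 * stepped once per frame with a fixed timestep. */
static void
chn_sim_example(void) {
  static struct chn_sim sim; // static: the SoA struct is large, avoid the stack
  chn_sim_init(&sim);
  const float rest[4 * 4] = {
    0.0f,  0.0f, 0.0f, 0.0f, // root particle, inv_mass 0 = pinned
    0.0f, -0.5f, 0.0f, 1.0f,
    0.0f, -1.0f, 0.0f, 1.0f,
    0.0f, -1.5f, 0.0f, 1.0f,
  };
  int chn = chn_sim_add(&sim, &chn_cfg_example, rest, 4);
  const float grav[3] = {0.0f, -9.81f, 0.0f};
  chn_sim_grav(&sim, chn, grav);
  for (int frame = 0; frame < 60; ++frame) {
    chn_sim_run(&sim, 1.0f / 60.0f); // fixed 60 Hz step
  }
  chn_sim_del(&sim, chn);
}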
/* ------------------------------------------------------------------ */
/* Second gist file: AVX2 variant of the same chain simulation, with   */
/* an AVBD-style solver configuration.                                  */
/* ------------------------------------------------------------------ */
#include <immintrin.h>
#include <math.h>

enum {
  MAX_CHAINS = 64,
  PARTICLES_PER_CHAIN = 16,
  SPHERES_PER_CHAIN = 8,
  MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
  MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
  CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
  MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
  MAX_ITERATIONS = 4,
};
struct chain_cfg {
  float damping;
  float friction;
  float collision_radius;
  float linear_inertia;      // [0, 1], 1.0f = physically correct linear motion
  float angular_inertia;     // [0, 1], 1.0f = physically correct angular rotation
  float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
  float avbd_alpha;          // Regularization parameter (e.g., 0.95f)
  float avbd_beta;           // Stiffness ramping control (e.g., 10.0f)
  float avbd_gamma;          // Warm starting scaling (e.g., 0.99f)
  float avbd_k_start;        // Initial stiffness for AVBD (e.g., 100.0f)
  float avbd_k_max;          // Maximum stiffness for AVBD (e.g., 1e6f)
} __attribute__((aligned(32)));
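/* Illustrative configuration (not in the original gist). The AVBD values
 * echo the example magnitudes given in the field comments above; damping,
 * friction, and collision_radius are assumptions for the sketch. */
static const struct chain_cfg chain_cfg_example = {
  .damping = 0.98f,
  .friction = 0.3f,
  .collision_radius = 0.05f, // assumed; no example value is documented above
  .linear_inertia = 1.0f,
  .angular_inertia = 1.0f,
  .centrifugal_inertia = 1.0f,
  .avbd_alpha = 0.95f,
  .avbd_beta = 10.0f,
  .avbd_gamma = 0.99f,
  .avbd_k_start = 100.0f,
  .avbd_k_max = 1e6f,
};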
struct chain_sim {
  unsigned short free_idx_cnt;
  unsigned short free_idx[MAX_CHAINS];
  struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));
  // Chain global forces (world space gravity and wind)
  float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
  float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
  float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));
  float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
  float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
  float wind_z[MAX_CHAINS] __attribute__((aligned(32)));
  // Chain transformations (world space)
  float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
  float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
  float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
  float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
  float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
  float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
  float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
  float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
  float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
  float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
  float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
  float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
  float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
  float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
  // Particle positions (model space)
  float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
  float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
  float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
  float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));
  float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
  float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
  float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
  float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
  float dist_lambda[MAX_REST_LENGTHS] __attribute__((aligned(32)));
  float dist_k_current[MAX_REST_LENGTHS] __attribute__((aligned(32)));
  float dist_prev_frame_error[MAX_REST_LENGTHS] __attribute__((aligned(32)));
  // Sphere positions (model space)
  float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
  float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
  float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
  float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));
  float coll_lambda[MAX_PARTICLES + 1] __attribute__((aligned(32)));
  float coll_k_current[MAX_PARTICLES + 1] __attribute__((aligned(32)));
  float coll_prev_frame_error[MAX_PARTICLES + 1] __attribute__((aligned(32)));
  // Rest positions (model space)
  float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
  float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
  float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
  float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
};
void simulate_chains(struct chain_sim *cs, float dt) {
  // Common SIMD constants
  const __m256 dt_vec = _mm256_set1_ps(dt);
  const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
  const __m256 one_vec = _mm256_set1_ps(1.0f);
  const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
  const __m256 eps_vec = _mm256_set1_ps(1e-6f);
  const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f);
  const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f);
  const __m256 zero_vec = _mm256_setzero_ps();
  const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
  const __m256 two_vec = _mm256_set1_ps(2.0f);
  const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
  const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
  const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
  // Local arrays for PBD solver
  float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
  float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
  float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
  float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
  // Initialize dummy elements
  px_local[PARTICLES_PER_CHAIN] = 0.0f;
  py_local[PARTICLES_PER_CHAIN] = 0.0f;
  pz_local[PARTICLES_PER_CHAIN] = 0.0f;
  pm_local[PARTICLES_PER_CHAIN] = 0.0f;
  // Stack arrays for precomputed data
  float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
  float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
  float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));
  float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
  float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
  float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));
  float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
  float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
  float vel_z[MAX_CHAINS] __attribute__((aligned(32)));
  float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
  float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
  float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));
  // Step 0: Precomputation (SIMD, 8 chains at once)
  for (int c = 0; c < MAX_CHAINS; c += 8) {
    // Load chain quaternions
    __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
    __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
    __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
    __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
    // Compute local-space wind
    {
      __m256 vx = _mm256_load_ps(&cs->wind_x[c]);
      __m256 vy = _mm256_load_ps(&cs->wind_y[c]);
      __m256 vz = _mm256_load_ps(&cs->wind_z[c]);
      // Create q_conjugate components (qw remains, qx, qy, qz are negated)
      __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
      __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
      __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
      // Step 1: t = q_conj * v_world_as_quat
      // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
      // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
      // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
      // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
      // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
      __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
      tx = _mm256_fnmadd_ps(cqz, vy, tx); // qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
      __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
      ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
      __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
      tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
      __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
      tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
      tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
      // Step 2: result_vec = (t * q)_vec
      // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
      // result_x = tw*qx + tx*qw + ty*qz - tz*qy
      // result_y = tw*qy + ty*qw + tz*qx - tx*qz
      // result_z = tw*qz + tz*qw + tx*qy - ty*qx
      __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
      result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
      __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
      result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
      __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
      result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
      _mm256_store_ps(&wind_local_x[c], result_x);
      _mm256_store_ps(&wind_local_y[c], result_y);
      _mm256_store_ps(&wind_local_z[c], result_z);
    }
    // Compute local-space gravity
    {
      __m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
      __m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
      __m256 vz = _mm256_load_ps(&cs->gravity_z[c]);
      // Create q_conjugate components (qw remains, qx, qy, qz are negated)
      __m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
      __m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
      __m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
      // Step 1: t = q_conj * v_world_as_quat
      // t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
      // tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
      // tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
      // ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
      // tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
      __m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
      tx = _mm256_fnmadd_ps(cqz, vy, tx); // qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
      __m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
      ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
      __m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
      tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
      __m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
      tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
      tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
      // Step 2: result_vec = (t * q)_vec
      // result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
      // result_x = tw*qx + tx*qw + ty*qz - tz*qy
      // result_y = tw*qy + ty*qw + tz*qx - tx*qz
      // result_z = tw*qz + tz*qw + tx*qy - ty*qx
      __m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
      result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
      __m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
      result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
      __m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
      result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
      _mm256_store_ps(&gravity_local_x[c], result_x);
      _mm256_store_ps(&gravity_local_y[c], result_y);
      _mm256_store_ps(&gravity_local_z[c], result_z);
    }
    // Compute linear velocity
    {
      __m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
      __m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
      __m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);
      __m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
      __m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
      __m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);
      __m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
      __m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
      __m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
      _mm256_store_ps(&vel_x[c], vel_x_vec);
      _mm256_store_ps(&vel_y[c], vel_y_vec);
      _mm256_store_ps(&vel_z[c], vel_z_vec);
    }
    // Compute angular velocity
    {
      __m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
      __m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
      __m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
      __m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
      __m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
      __m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
      __m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
      __m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);
      // Step 1: Compute delta quaternion (to - from)
      __m256 delta_qx = _mm256_sub_ps(qx, prev_qx);
      __m256 delta_qy = _mm256_sub_ps(qy, prev_qy);
      __m256 delta_qz = _mm256_sub_ps(qz, prev_qz);
      __m256 delta_qw = _mm256_sub_ps(qw, prev_qw);
      // Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz)
      __m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx);
      __m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy);
      __m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz);
      __m256 conj_qw = prev_qw;
      // Step 3: Compute orientation delta = deltaQuat * conj(prev_quat)
      // Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2)
      // Result: orientationDelta = (delta_qw, delta_qx, delta_qy, delta_qz) * (conj_qw, conj_qx, conj_qy, conj_qz)
      __m256 orient_qw = _mm256_fnmadd_ps(delta_qx, conj_qx, _mm256_mul_ps(delta_qw, conj_qw));
      orient_qw = _mm256_fnmadd_ps(delta_qy, conj_qy, orient_qw);
      orient_qw = _mm256_fnmadd_ps(delta_qz, conj_qz, orient_qw);
      __m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(delta_qy, conj_qz), _mm256_mul_ps(delta_qz, conj_qy));
      __m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(delta_qz, conj_qx), _mm256_mul_ps(delta_qx, conj_qz));
      __m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(delta_qx, conj_qy), _mm256_mul_ps(delta_qy, conj_qx));
      __m256 orient_qx = _mm256_fmadd_ps(delta_qw, conj_qx, _mm256_mul_ps(delta_qx, conj_qw));
      orient_qx = _mm256_add_ps(orient_qx, cross_x);
      __m256 orient_qy = _mm256_fmadd_ps(delta_qw, conj_qy, _mm256_mul_ps(delta_qy, conj_qw));
      orient_qy = _mm256_add_ps(orient_qy, cross_y);
      __m256 orient_qz = _mm256_fmadd_ps(delta_qw, conj_qz, _mm256_mul_ps(delta_qz, conj_qw));
      orient_qz = _mm256_add_ps(orient_qz, cross_z);
      // Step 4: Compute dot product (to, from) for shortest arc check
      __m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw));
      dot = _mm256_fmadd_ps(qy, prev_qy, dot);
      dot = _mm256_fmadd_ps(qz, prev_qz, dot);
      // Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment)
      __m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ);
      __m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec),
                                        _mm256_andnot_ps(negate_mask, one_vec));
      orient_qx = _mm256_mul_ps(orient_qx, negate_sign);
      orient_qy = _mm256_mul_ps(orient_qy, negate_sign);
      orient_qz = _mm256_mul_ps(orient_qz, negate_sign);
      // Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt))
      __m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec));
      __m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale);
      __m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale);
      __m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale);
      // Store results
      _mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
      _mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
      _mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
    }
  }
  // Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
  for (int c = 0; c < MAX_CHAINS; c++) {
    int base_idx = c * PARTICLES_PER_CHAIN;
    struct chain_cfg *cfg = &cs->chain_configs[c];
    // Load precomputed velocities and inertia parameters
    __m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
    __m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
    __m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
    __m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
    __m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
    __m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
    __m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
    __m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
    __m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
    // Clamp inertia parameters to [0, 1]
    linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
    for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
      __m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
      __m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
      __m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
      __m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
      p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
      // Linear inertia: v * dt * linear_inertia
      __m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
      __m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
      __m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
      // Angular inertia: (ω × r) * dt * angular_inertia
      __m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
      __m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
      __m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
      ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
      ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
      ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);
      // Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
      __m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
      __m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
      __m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
      __m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
      __m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
      __m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
      // Calculate displacement: (ω × (ω × r)) * dt^2
      __m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
      __m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
      __m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);
      // Apply the centrifugal_inertia factor
      __m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
__m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec); | |
__m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec); | |
// Total delta | |
__m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x); | |
__m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y); | |
__m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z); | |
// Update positions | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z)); | |
} | |
// Update previous transformation | |
cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; | |
cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; | |
cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; | |
cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; | |
cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; | |
cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; | |
cs->prev_chain_quat_w[c] = cs->chain_quat_w[c]; | |
} | |
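// Scalar sketch of the inertia model applied above for one particle r in model | |
// space (hedged reconstruction of the SIMD lanes, not an exact drop-in): | |
//   dx  = v * dt * linear_inertia                      // frame linear motion | |
//   dx += cross(w, r) * dt * angular_inertia           // frame rotation | |
//   dx += cross(w, cross(w, r)) * dt*dt * centrifugal_inertia | |
//   r  += dx | |
// Per the chn_cfg comments, a gain of 1.0f corresponds to physically correct | |
// motion transfer; 0.0f means the chain transform leaves particles untouched. | |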
// Step 2: Verlet Integration | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]); | |
__m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]); | |
__m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]); | |
__m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]); | |
__m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]); | |
__m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]); | |
__m256 damping_vec = _mm256_set1_ps(cfg->damping); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]); | |
__m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec); | |
__m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec); | |
__m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec); | |
__m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass); | |
__m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass); | |
__m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec); | |
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec); | |
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec); | |
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec); | |
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec); | |
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x)); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z); | |
_mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x); | |
_mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y); | |
_mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z); | |
} | |
} | |
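// Scalar form of the damped Verlet step above (hedged sketch): | |
//   a      = (gravity_local + wind_local) * inv_mass | |
//   x_new  = x + (x - x_prev) * damping + a * dt*dt | |
//   x_prev = x; x = x_new | |
// damping in [0,1] scales the implicit velocity (x - x_prev): 1 preserves it | |
// fully, 0 removes it entirely. | |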
// --- AVBD Warm-starting of lambda and k_current for all distance constraints. | |
for (int c = 0; c < MAX_CHAINS; ++c) { | |
struct chain_cfg* cfg = &cs->chain_configs[c]; | |
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma); | |
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start); | |
// Loop through each block of 8 constraints for warm-starting | |
int r_base = c * CONSTRAINTS_PER_CHAIN; // Base index for this chain's constraints | |
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += 8) { | |
// Load current lambda, k_current for 8 constraints simultaneously | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + i]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + i]); | |
// Apply warm-starting (Eq. 19 from paper) | |
// lambda_new = lambda_prev_frame * alpha * gamma | |
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec)); | |
// k_current_new = max(K_START, k_current_prev_frame * gamma) | |
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec)); | |
// Store updated warm-started values back | |
_mm256_store_ps(&cs->dist_lambda[r_base + i], current_lambda_vec); | |
_mm256_store_ps(&cs->dist_k_current[r_base + i], current_k_vec); | |
} | |
} | |
// Warm-starting for Collision Constraints --- | |
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) { | |
struct chain_cfg* cfg = &cs->chain_configs[c_idx]; | |
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma); | |
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start); | |
int p_base = c_idx * PARTICLES_PER_CHAIN; // Base index for this chain's particles | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) { | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]); | |
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec)); | |
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec)); | |
_mm256_store_ps(&cs->coll_lambda[p_base + i], current_lambda_vec); | |
_mm256_store_ps(&cs->coll_k_current[p_base + i], current_k_vec); | |
} | |
} | |
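// Scalar view of the AVBD warm-start applied to both constraint types above | |
// (hedged; alpha/gamma/k_start come from the chain config): | |
//   lambda    = lambda_prev_frame * alpha * gamma;    // Eq. 19 | |
//   k_current = max(k_start, k_current_prev * gamma); // stiffness decay | |
// gamma < 1 bleeds off stale multipliers and stiffness between frames, while | |
// alpha controls how much of last frame's dual state is trusted. | |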
// Step 3: AVBD Distance Constraint Solver (primal + dual updates per iteration) | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 alpha_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 beta_vec = _mm256_set1_ps(cfg->avbd_beta); | |
__m256 k_max_vec = _mm256_set1_ps(cfg->avbd_k_max); | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int r_base = c * CONSTRAINTS_PER_CHAIN; | |
// Load all current particle positions into local buffer (once per chain per iteration) | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
_mm256_store_ps(&px_local[i], _mm256_load_ps(&cs->p_pos_x[p_base + i])); | |
_mm256_store_ps(&py_local[i], _mm256_load_ps(&cs->p_pos_y[p_base + i])); | |
_mm256_store_ps(&pz_local[i], _mm256_load_ps(&cs->p_pos_z[p_base + i])); | |
_mm256_store_ps(&pm_local[i], _mm256_load_ps(&cs->inv_mass[p_base + i])); | |
_mm256_store_ps(&pm_local[i], _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&pm_local[i]), inv_mass_clamp_max), inv_mass_clamp_min)); | |
} | |
for (int iter = 0; iter < MAX_ITERATIONS; iter++) { | |
// First block (i=0 to 7 of particles, processing constraints 0 to 7) | |
{ | |
// p0 is aligned from px_local[0] | |
__m256 p0x = _mm256_load_ps(&px_local[0]); | |
__m256 p0y = _mm256_load_ps(&py_local[0]); | |
__m256 p0z = _mm256_load_ps(&pz_local[0]); | |
__m256 p0m = _mm256_load_ps(&pm_local[0]); | |
// p1 is unaligned from px_local[1] | |
__m256 p1x = _mm256_loadu_ps(&px_local[1]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[1]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[1]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[1]); | |
// This corresponds to rest_lengths[r_base + 0] through rest_lengths[r_base + 7] | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]); | |
// Calculate current vector and distance between particles | |
__m256 dx_vec = _mm256_sub_ps(p1x, p0x); | |
__m256 dy_vec = _mm256_sub_ps(p1y, p0y); | |
__m256 dz_vec = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec); | |
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); // 1/dist | |
__m256 dist = _mm256_rcp_ps(inv_dist); // Current distance | |
// Primal Update for Distance Constraints (Block 1) --- | |
// Load AVBD state for this block of 8 constraints (indices r_base + 0 to r_base + 7) | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 0]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 0]); | |
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 0]); | |
// Original constraint error C_j*(x) = current_dist - target_length | |
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec); | |
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18) | |
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec)); | |
// Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda) | |
// The effective displacement is proportional to this force divided by effective stiffness (k) and mass. | |
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec); | |
// Calculate the scalar part of the correction, similar to original 'corr_scalar_part' | |
// This is simplified, representing the desired displacement due to the augmented force | |
__m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor | |
// Normalize difference vector | |
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist); | |
// Calculate the actual displacement vectors | |
// Note the sign inversion from original: -(delta_x * pm0) for p0 and +(delta_x * pm1) for p1 | |
// The 'effective_corr_scalar_part' already implicitly carries the sign of the desired correction. | |
// So, we just multiply by the unit direction and inverse mass. | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part); | |
// Apply correction mask (from original code, avoid division by zero artifacts where dist is ~0) | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
// Update positions in local buffer | |
// p0 moves along -delta (subtract), p1 moves along +delta (add) | |
_mm256_store_ps(&px_local[0], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m))); | |
_mm256_store_ps(&py_local[0], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m))); | |
_mm256_store_ps(&pz_local[0], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m))); | |
_mm256_storeu_ps(&px_local[1], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m))); | |
_mm256_storeu_ps(&py_local[1], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m))); | |
_mm256_storeu_ps(&pz_local[1], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m))); | |
// Dual Update for Distance Constraints (Block 1) --- | |
// Recalculate constraint error *after* position updates for accurate lambda update | |
// Reload updated positions from local buffer to get the 'post-primal' state | |
p0x = _mm256_load_ps(&px_local[0]); | |
p0y = _mm256_load_ps(&py_local[0]); | |
p0z = _mm256_load_ps(&pz_local[0]); | |
p1x = _mm256_loadu_ps(&px_local[1]); | |
p1y = _mm256_loadu_ps(&py_local[1]); | |
p1z = _mm256_loadu_ps(&pz_local[1]); | |
__m256 updated_dx = _mm256_sub_ps(p1x, p0x); | |
__m256 updated_dy = _mm256_sub_ps(p1y, p0y); | |
__m256 updated_dz = _mm256_sub_ps(p1z, p0z); | |
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq); | |
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec)); | |
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist); | |
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update | |
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) | |
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec); | |
_mm256_store_ps(&cs->dist_lambda[r_base + 0], new_lambda_vec); // Store back | |
// Update k_current (Stiffness ramping, Eq. 12) | |
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) | |
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask | |
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec); | |
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec); | |
_mm256_store_ps(&cs->dist_k_current[r_base + 0], new_k_vec); // Store back | |
} | |
// Second block (particles 8 to 15, processing constraints 8 to 14; lane 7 is masked off below) | |
{ | |
// p0 is aligned from px_local[8] | |
__m256 p0x = _mm256_load_ps(&px_local[8]); | |
__m256 p0y = _mm256_load_ps(&py_local[8]); | |
__m256 p0z = _mm256_load_ps(&pz_local[8]); | |
__m256 p0m = _mm256_load_ps(&pm_local[8]); | |
// p1 is unaligned from px_local[9], *including* the dummy element at px_local[16] | |
// (lane 7 pairs particle 15 with the dummy slot; its correction is masked off below) | |
__m256 p1x = _mm256_loadu_ps(&px_local[9]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[9]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[9]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[9]); | |
// This corresponds to rest_lengths[r_base + 8] through rest_lengths[r_base + 15] | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]); | |
__m256 dx_vec = _mm256_sub_ps(p1x, p0x); | |
__m256 dy_vec = _mm256_sub_ps(p1y, p0y); | |
__m256 dz_vec = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec); | |
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
// Primal Update for Distance Constraints (Block 2) --- | |
// Load AVBD state for this block of 8 constraints (indices r_base + 8 to r_base + 15) | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 8]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 8]); | |
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 8]); | |
// Original constraint error C_j*(x) = current_dist - target_length | |
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec); | |
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18) | |
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec)); | |
// Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda) | |
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec); | |
// Calculate the scalar part of the correction, similar to original 'corr_scalar_part' | |
__m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
// This 'valid_mask' handles the fact that the last particle doesn't have a *next* particle | |
// for a distance constraint. It effectively sets the correction for the last element (index 7 of the block) to zero. | |
// For PARTICLES_PER_CHAIN = 16, this second block covers constraints for particles 8 to 15. | |
// The constraint for particle 15 (index 7 in this block) is between particle 15 and 16 (the dummy). | |
// The mask effectively disables the constraint correction for the dummy particle. | |
// Note: the mask must be all-bits-set per active lane (a float -1.0f is NOT an all-ones pattern) | |
__m256 valid_mask = _mm256_castsi256_ps(_mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1)); | |
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); | |
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist); | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
// Update positions in local buffer | |
_mm256_store_ps(&px_local[8], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m))); | |
_mm256_store_ps(&py_local[8], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m))); | |
_mm256_store_ps(&pz_local[8], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m))); | |
_mm256_storeu_ps(&px_local[9], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m))); | |
_mm256_storeu_ps(&py_local[9], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m))); | |
_mm256_storeu_ps(&pz_local[9], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m))); | |
// Recalculate constraint error *after* position updates for accurate lambda update | |
// Reload updated positions from local buffer | |
p0x = _mm256_load_ps(&px_local[8]); | |
p0y = _mm256_load_ps(&py_local[8]); | |
p0z = _mm256_load_ps(&pz_local[8]); | |
p1x = _mm256_loadu_ps(&px_local[9]); | |
p1y = _mm256_loadu_ps(&py_local[9]); | |
p1z = _mm256_loadu_ps(&pz_local[9]); | |
__m256 updated_dx = _mm256_sub_ps(p1x, p0x); | |
__m256 updated_dy = _mm256_sub_ps(p1y, p0y); | |
__m256 updated_dz = _mm256_sub_ps(p1z, p0z); | |
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq); | |
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec)); | |
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist); | |
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update | |
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) | |
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec); | |
_mm256_store_ps(&cs->dist_lambda[r_base + 8], new_lambda_vec); // Store back | |
// Update k_current (Stiffness ramping, Eq. 12) | |
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) | |
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask | |
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec); | |
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec); | |
_mm256_store_ps(&cs->dist_k_current[r_base + 8], new_k_vec); // Store back | |
} // End of second block | |
} | |
// After all iterations, save the final positions from local buffer to global cs->p_pos_x/y/z | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_load_ps(&px_local[i])); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_load_ps(&py_local[i])); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_load_ps(&pz_local[i])); | |
} | |
} | |
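// One distance constraint of the solver above in scalar form (hedged sketch; | |
// per-lane equivalent of the two SIMD blocks, signs as in the code): | |
//   C      = |p1 - p0| - rest_len;            // constraint error | |
//   C_reg  = C - alpha * C_prev_frame;        // regularization (Eq. 18) | |
//   f      = k * C_reg + lambda;              // augmented force term | |
//   d      = normalize(p1 - p0) * (f / k);    // displacement along constraint | |
//   p0    -= d * w0;  p1 += d * w1;           // inverse-mass weighted split | |
//   C_new  = |p1 - p0| - rest_len;            // re-measure after the move | |
//   lambda = lambda + k * C_new;              // dual update (Eq. 11) | |
//   k      = min(k_max, k + beta * |C_new|);  // stiffness ramp (Eq. 12) | |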
// Step 4: Sphere Collisions with Friction | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int s_base = c * SPHERES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 friction_vec = _mm256_set1_ps(cfg->friction); | |
for (int s = 0; s < SPHERES_PER_CHAIN; s++) { | |
__m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]); | |
__m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]); | |
__m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]); | |
__m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]); | |
__m256 alpha_coll_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 beta_coll_vec = _mm256_set1_ps(cfg->avbd_beta); | |
__m256 k_max_coll_vec = _mm256_set1_ps(cfg->avbd_k_max); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); | |
__m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); | |
__m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(sphere_r_vec, dist); | |
__m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 normal_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 normal_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 normal_z = _mm256_mul_ps(dz, inv_dist); | |
// Load AVBD state for this particle block | |
__m256 coll_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]); | |
__m256 coll_k_current_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]); | |
__m256 coll_prev_frame_error_vec = _mm256_load_ps(&cs->coll_prev_frame_error[p_base + i]); | |
// AVBD Primal Update: | |
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) | |
__m256 C_x_regularized = _mm256_sub_ps(penetration, _mm256_mul_ps(alpha_coll_vec, coll_prev_frame_error_vec)); | |
// Apply collision_mask to regularized error, so only active constraints contribute. | |
// This ensures C_x_regularized is 0 for non-penetrating particles, making force_term 0. | |
C_x_regularized = _mm256_and_ps(C_x_regularized, collision_mask); | |
// Calculate force term: (k * C_reg + lambda) | |
__m256 force_term = _mm256_fmadd_ps(coll_k_current_vec, C_x_regularized, coll_lambda_vec); | |
// Calculate correction magnitude (delta_p in paper): force_term * inv_mass / k_current | |
// For collision, delta_x = -nablaC_j(x) * (H_j_inv * (lambda_j + K_j C_j(x))). | |
// H_j_inv for a single particle collision is its inverse mass (p_inv_mass). | |
// So, the correction should be along the normal: -normal * (force_term * p_inv_mass) / k_current | |
// The force_term already has the appropriate sign for correction direction when C_x_regularized is positive (penetration). | |
__m256 correction_magnitude_scalar = _mm256_div_ps(_mm256_mul_ps(force_term, p_inv_mass), coll_k_current_vec); | |
// Apply the correction along the normal direction. | |
// The sign of 'penetration' (and thus 'C_x_regularized' and 'force_term') | |
// means positive 'correction_magnitude_scalar' should push OUT of collision. | |
// So we subtract from particle position. | |
__m256 delta_pos_x = _mm256_mul_ps(normal_x, correction_magnitude_scalar); | |
__m256 delta_pos_y = _mm256_mul_ps(normal_y, correction_magnitude_scalar); | |
__m256 delta_pos_z = _mm256_mul_ps(normal_z, correction_magnitude_scalar); | |
// Update positions: p_curr = p_curr - delta_pos (particle moves out of penetration) | |
p_curr_x = _mm256_sub_ps(p_curr_x, delta_pos_x); | |
p_curr_y = _mm256_sub_ps(p_curr_y, delta_pos_y); | |
p_curr_z = _mm256_sub_ps(p_curr_z, delta_pos_z); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z); | |
// Apply friction (post-stabilization step) | |
// Recalculate velocity based on updated positions | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
// Project velocity onto normal to get normal component | |
__m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x); | |
vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal); | |
vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal); | |
// Get tangential velocity component | |
__m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x)); | |
__m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y)); | |
__m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z)); | |
__m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq); | |
// Calculate friction force magnitude limit (based on normal force proxy) | |
__m256 friction_mag_limit = _mm256_mul_ps(penetration, friction_vec); | |
// Actual tangential velocity length | |
__m256 actual_tangent_mag = _mm256_sqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)); | |
// Calculate the scaling factor for friction. If actual_tangent_mag is greater than limit, scale down. | |
__m256 friction_scale = _mm256_div_ps(friction_mag_limit, _mm256_max_ps(actual_tangent_mag, eps_vec)); | |
friction_scale = _mm256_min_ps(friction_scale, one_vec); // Clamp to 1.0 (no acceleration from friction) | |
// Apply friction as a reduction of tangential velocity, converting to position change | |
// The idea is to reduce the tangential displacement that happened *this* iteration. | |
// The original `vel_x/y/z` are already displacements from `p_prev` to `p_curr`. | |
// So, the tangential displacement is `tangent_x/y/z`. | |
// We remove the fraction `friction_scale` of it, so `(1 - friction_scale)` remains. | |
__m256 friction_x_corr = _mm256_mul_ps(tangent_x, friction_scale); | |
__m256 friction_y_corr = _mm256_mul_ps(tangent_y, friction_scale); | |
__m256 friction_z_corr = _mm256_mul_ps(tangent_z, friction_scale); | |
// Only apply if actively colliding | |
friction_x_corr = _mm256_and_ps(friction_x_corr, collision_mask); | |
friction_y_corr = _mm256_and_ps(friction_y_corr, collision_mask); | |
friction_z_corr = _mm256_and_ps(friction_z_corr, collision_mask); | |
// Apply friction by moving the particle against its tangential motion. | |
// friction_scale = 0: no change; friction_scale = 1: all tangential motion removed (the normal component is kept). | |
p_curr_x = _mm256_sub_ps(p_curr_x, friction_x_corr); | |
p_curr_y = _mm256_sub_ps(p_curr_y, friction_y_corr); | |
p_curr_z = _mm256_sub_ps(p_curr_z, friction_z_corr); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z); | |
// Dual Update for Collision Constraints --- | |
// Recalculate collision error *after* position updates (normal & friction) | |
// Reload updated positions | |
p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); | |
dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); | |
dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); | |
dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
dist = _mm256_rcp_ps(inv_dist); | |
__m256 C_x_updated_collision = _mm256_sub_ps(sphere_r_vec, dist); // New error after primal update | |
// Apply the collision mask again for updates, so only active constraints update their AVBD state | |
C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, collision_mask); | |
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) | |
__m256 new_coll_lambda_vec = _mm256_fmadd_ps(coll_k_current_vec, C_x_updated_collision, coll_lambda_vec); | |
_mm256_store_ps(&cs->coll_lambda[p_base + i], new_coll_lambda_vec); | |
// Update k_current (Stiffness ramping, Eq. 12) | |
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) | |
__m256 abs_C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, abs_mask_ps); // Absolute value | |
__m256 k_updated_coll_vec = _mm256_fmadd_ps(beta_coll_vec, abs_C_x_updated_collision, coll_k_current_vec); | |
__m256 new_coll_k_vec = _mm256_min_ps(k_max_coll_vec, k_updated_coll_vec); | |
_mm256_store_ps(&cs->coll_k_current[p_base + i], new_coll_k_vec); | |
// --- END AVBD ADDITION --- | |
} | |
} | |
} | |
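// Scalar sketch of one particle-vs-sphere AVBD contact above (hedged; signs | |
// and masking follow the code): | |
//   n     = (p - c) / |p - c|;                 // contact normal | |
//   C     = r - |p - c|;                       // penetration depth (> 0 inside) | |
//   C_reg = (C - alpha * C_prev) on colliding lanes, 0 elsewhere; | |
//   p    -= n * ((k * C_reg + lambda) * inv_mass / k);   // primal push-out | |
//   friction: split the frame displacement v = p - p_prev into normal and | |
//   tangential parts, then remove min(1, friction * C / |v_t|) of v_t; | |
//   lambda = lambda + k * C_new;  k = min(k_max, k + beta * |C_new|); | |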
// Step 5: Motion Constraints | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]); | |
__m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]); | |
__m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]); | |
__m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]); | |
__m256 dx = _mm256_sub_ps(px, rx); | |
__m256 dy = _mm256_sub_ps(py, ry); | |
__m256 dz = _mm256_sub_ps(pz, rz); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(dist, radius_vec); | |
__m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 corr_factor = _mm256_mul_ps(penetration, inv_dist); | |
corr_factor = _mm256_and_ps(corr_factor, mask); | |
__m256 delta_x = _mm256_mul_ps(dx, corr_factor); | |
__m256 delta_y = _mm256_mul_ps(dy, corr_factor); | |
__m256 delta_z = _mm256_mul_ps(dz, corr_factor); | |
delta_x = _mm256_mul_ps(delta_x, pm); | |
delta_y = _mm256_mul_ps(delta_y, pm); | |
delta_z = _mm256_mul_ps(delta_z, pm); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z)); | |
} | |
} | |
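// The motion constraint above in scalar form (hedged sketch): each particle is | |
// kept inside a sphere of motion_radius around its rest position: | |
//   d = p - rest; | |
//   if (|d| > radius) p -= d * ((|d| - radius) / |d|) * inv_mass; | |
// The inv_mass weighting means heavy particles are pulled back more weakly and | |
// pinned particles (inv_mass = 0) are never moved. | |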
// Store Final Errors for Next Frame's Regularization --- | |
// This happens once at the very end of simulate_chains, after all iterations are complete for the frame. | |
for (int c = 0; c < MAX_CHAINS; ++c) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int r_base = c * CONSTRAINTS_PER_CHAIN; | |
// Loop through each block of 8 constraints (distance constraints are N-1 for N particles) | |
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += VEC_FLOAT_COUNT) { | |
// Calculate the current error for this distance constraint block | |
// (Similar to C_x_original calculation from Step 3, but using final positions) | |
// Load final positions for p0 and p1 (from global arrays now, as local buffer is stale) | |
__m256 px0 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 0]); | |
__m256 py0 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 0]); | |
__m256 pz0 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 0]); | |
__m256 px1 = _mm256_loadu_ps(&cs->p_pos_x[p_base + i + 1]); // +1 offset is unaligned, so loadu | |
__m256 py1 = _mm256_loadu_ps(&cs->p_pos_y[p_base + i + 1]); | |
__m256 pz1 = _mm256_loadu_ps(&cs->p_pos_z[p_base + i + 1]); | |
__m256 diff_x = _mm256_sub_ps(px1, px0); | |
__m256 diff_y = _mm256_sub_ps(py1, py0); | |
__m256 diff_z = _mm256_sub_ps(pz1, pz0); | |
__m256 dist_sq = _mm256_mul_ps(diff_x, diff_x); | |
dist_sq = _mm256_fmadd_ps(diff_y, diff_y, dist_sq); | |
dist_sq = _mm256_fmadd_ps(diff_z, diff_z, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + i]); | |
__m256 final_error_vec = _mm256_sub_ps(dist, rest_len_vec); | |
_mm256_store_ps(&cs->dist_prev_frame_error[r_base + i], final_error_vec); // Store for next frame's regularization | |
} | |
} | |
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) { | |
int p_base = c_idx * PARTICLES_PER_CHAIN; | |
int s_base = c_idx * SPHERES_PER_CHAIN; // For accessing sphere data for this chain | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 deepest_penetration_vec = neg_flt_max_vec; // Initialize with -FLT_MAX | |
// Loop through all spheres for this chain to find the max penetration for each particle in the block | |
for (int s_idx = 0; s_idx < SPHERES_PER_CHAIN; ++s_idx) { | |
__m256 s_x = _mm256_set1_ps(cs->sphere_x[s_base + s_idx]); | |
__m256 s_y = _mm256_set1_ps(cs->sphere_y[s_base + s_idx]); | |
__m256 s_z = _mm256_set1_ps(cs->sphere_z[s_base + s_idx]); | |
__m256 s_r = _mm256_set1_ps(cs->sphere_radius[s_base + s_idx]); | |
__m256 dx_s = _mm256_sub_ps(p_curr_x, s_x); | |
__m256 dy_s = _mm256_sub_ps(p_curr_y, s_y); | |
__m256 dz_s = _mm256_sub_ps(p_curr_z, s_z); | |
__m256 dist_sq_s = _mm256_mul_ps(dx_s, dx_s); | |
dist_sq_s = _mm256_fmadd_ps(dy_s, dy_s, dist_sq_s); | |
dist_sq_s = _mm256_fmadd_ps(dz_s, dz_s, dist_sq_s); | |
__m256 inv_dist_s = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq_s, eps_vec)); | |
__m256 dist_s = _mm256_rcp_ps(inv_dist_s); | |
__m256 current_penetration_s = _mm256_sub_ps(s_r, dist_s); // C(x) = radius - distance | |
// We only care about positive penetration (i.e., actually colliding) | |
current_penetration_s = _mm256_max_ps(zero_vec, current_penetration_s); | |
deepest_penetration_vec = _mm256_max_ps(deepest_penetration_vec, current_penetration_s); | |
} | |
_mm256_store_ps(&cs->coll_prev_frame_error[p_base + i], deepest_penetration_vec); // Store for next frame's regularization | |
} | |
} | |
} |
#include <immintrin.h> | |
#include <math.h> | |
#include <assert.h> | |
#include <string.h> | |
#include <float.h> | |
enum { | |
MAX_CHNS = 64, | |
PTC_PER_CHN = 16, | |
SPH_PER_CHN = 8, | |
MAX_PTC = (MAX_CHNS * PTC_PER_CHN), | |
MAX_SPH = (MAX_CHNS * SPH_PER_CHN), | |
CON_PER_CHN = PTC_PER_CHN, | |
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), | |
MAX_ITR = 16, | |
}; | |
struct chn_cfg { | |
float damping; // [0, 1] | |
float stiffness; // [0, 1] | |
float stretch_factor; | |
float max_strain; | |
float friction; // [0, 1] | |
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion | |
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation | |
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force | |
}; | |
struct chn_sim { | |
unsigned char free_idx_cnt; | |
unsigned char free_idx[MAX_CHNS]; | |
struct chn_cfg chn_cfg[MAX_CHNS] __attribute__((aligned(32))); | |
// chain global forces (world space gravity and wind) | |
float chn_grav_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_grav_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_grav_z[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_wnd_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_wnd_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_wnd_z[MAX_CHNS] __attribute__((aligned(32))); | |
// chain transformations (world space) | |
float chn_pos_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_pos_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_pos_z[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_quat_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_quat_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_quat_z[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_quat_w[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_prev_pos_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_prev_pos_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_prev_pos_z[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_prev_quat_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_prev_quat_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_prev_quat_z[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_prev_quat_w[MAX_CHNS] __attribute__((aligned(32))); | |
// particle positions (model space) | |
float ptc_pos_x[MAX_PTC] __attribute__((aligned(32))); | |
float ptc_pos_y[MAX_PTC] __attribute__((aligned(32))); | |
float ptc_pos_z[MAX_PTC] __attribute__((aligned(32))); | |
float ptc_inv_mass[MAX_PTC] __attribute__((aligned(32))); | |
float ptc_prev_pos_x[MAX_PTC] __attribute__((aligned(32))); | |
float ptc_prev_pos_y[MAX_PTC] __attribute__((aligned(32))); | |
float ptc_prev_pos_z[MAX_PTC] __attribute__((aligned(32))); | |
float ptc_rest_len[MAX_REST_LEN] __attribute__((aligned(32))); | |
// sphere positions (model space) | |
float sph_x[MAX_SPH] __attribute__((aligned(32))); | |
float sph_y[MAX_SPH] __attribute__((aligned(32))); | |
float sph_z[MAX_SPH] __attribute__((aligned(32))); | |
float sph_r[MAX_SPH] __attribute__((aligned(32))); | |
// rest positions (model space) | |
float rest_pos_x[MAX_PTC] __attribute__((aligned(32))); | |
float rest_pos_y[MAX_PTC] __attribute__((aligned(32))); | |
float rest_pos_z[MAX_PTC] __attribute__((aligned(32))); | |
float motion_radius[MAX_PTC] __attribute__((aligned(32))); | |
}; | |
extern void | |
chn_sim_init(struct chn_sim *cns) { | |
assert(cns); | |
cns->free_idx_cnt = MAX_CHNS; | |
for (int i = 0; i < MAX_CHNS; ++i) { | |
cns->free_idx[i] = MAX_CHNS - i - 1; | |
} | |
for (int i = 0; i < MAX_CHNS; i += 8) { | |
_mm256_store_ps(&cns->chn_quat_w[i], _mm256_set1_ps(1.0f)); | |
_mm256_store_ps(&cns->chn_prev_quat_w[i], _mm256_set1_ps(1.0f)); | |
} | |
} | |
extern int | |
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *restrict rest_pos, int cnt) { | |
assert(cns); | |
assert(cfg); | |
assert(cns->free_idx_cnt > 0); | |
assert(cnt <= PTC_PER_CHN); | |
assert(cnt > 1); | |
int chn = cns->free_idx[--cns->free_idx_cnt]; | |
cns->chn_cfg[chn] = *cfg; | |
int p_idx = (chn * PTC_PER_CHN); | |
int r_idx = (chn * CON_PER_CHN); | |
for (int i = 0; i < cnt; ++i) { | |
cns->ptc_pos_x[p_idx + i] = rest_pos[i*4+0]; | |
cns->ptc_pos_y[p_idx + i] = rest_pos[i*4+1]; | |
cns->ptc_pos_z[p_idx + i] = rest_pos[i*4+2]; | |
cns->ptc_inv_mass[p_idx + i] = rest_pos[i*4+3]; | |
cns->ptc_prev_pos_x[p_idx + i] = rest_pos[i*4+0]; // seed prev with rest so the first step starts at zero velocity | |
cns->ptc_prev_pos_y[p_idx + i] = rest_pos[i*4+1]; | |
cns->ptc_prev_pos_z[p_idx + i] = rest_pos[i*4+2]; | |
cns->rest_pos_x[p_idx + i] = rest_pos[i*4+0]; | |
cns->rest_pos_y[p_idx + i] = rest_pos[i*4+1]; | |
cns->rest_pos_z[p_idx + i] = rest_pos[i*4+2]; | |
} | |
for (int i = cnt; i < PTC_PER_CHN; ++i) { | |
cns->ptc_pos_x[p_idx + i] = 0.0f; | |
cns->ptc_pos_y[p_idx + i] = 0.0f; | |
cns->ptc_pos_z[p_idx + i] = 0.0f; | |
cns->ptc_inv_mass[p_idx + i] = 0.0f; // padding particles are pinned | |
} | |
for (int i = 1; i < cnt; ++i) { | |
float dx = rest_pos[i*4+0] - rest_pos[(i-1)*4+0]; | |
float dy = rest_pos[i*4+1] - rest_pos[(i-1)*4+1]; | |
float dz = rest_pos[i*4+2] - rest_pos[(i-1)*4+2]; | |
cns->ptc_rest_len[r_idx + i-1] = sqrtf(dx*dx + dy*dy + dz*dz); | |
} | |
for (int i = cnt - 1; i < CON_PER_CHN; ++i) { // cnt particles have cnt-1 constraints | |
cns->ptc_rest_len[r_idx + i] = 0.0f; | |
} | |
return chn; | |
} | |
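// Example (hedged, illustrative only): building a 4-particle hanging chain. | |
// rest_pos is packed as xyzw per particle, w = inverse mass (0 pins the root). | |
// example_add_pendulum is a hypothetical helper, not part of the API above. | |
static int | |
example_add_pendulum(struct chn_sim *cns, const struct chn_cfg *cfg) { | |
const float rest_pos[4 * 4] = { | |
0.0f, 0.0f, 0.0f, 0.0f, // root, pinned (inv mass 0) | |
0.0f, -0.5f, 0.0f, 1.0f, | |
0.0f, -1.0f, 0.0f, 1.0f, | |
0.0f, -1.5f, 0.0f, 1.0f, | |
}; | |
return chn_sim_add(cns, cfg, rest_pos, 4); | |
} | |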
extern void | |
chn_sim_del(struct chn_sim *cns, int chn) { | |
assert(cns); | |
assert(chn >= 0); | |
assert(chn < MAX_CHNS); | |
int p_idx = (chn * PTC_PER_CHN); | |
int s_idx = (chn * SPH_PER_CHN); | |
int c_idx = (chn * CON_PER_CHN); | |
cns->free_idx[cns->free_idx_cnt++] = chn; | |
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); | |
cns->chn_grav_x[chn] = 0.0f; | |
cns->chn_grav_y[chn] = 0.0f; | |
cns->chn_grav_z[chn] = 0.0f; | |
cns->chn_wnd_x[chn] = 0.0f; | |
cns->chn_wnd_y[chn] = 0.0f; | |
cns->chn_wnd_z[chn] = 0.0f; | |
cns->chn_pos_x[chn] = 0.0f; | |
cns->chn_pos_y[chn] = 0.0f; | |
cns->chn_pos_z[chn] = 0.0f; | |
cns->chn_quat_x[chn] = 0.0f; | |
cns->chn_quat_y[chn] = 0.0f; | |
cns->chn_quat_z[chn] = 0.0f; | |
cns->chn_quat_w[chn] = 1.0f; | |
cns->chn_prev_pos_x[chn] = 0.0f; | |
cns->chn_prev_pos_y[chn] = 0.0f; | |
cns->chn_prev_pos_z[chn] = 0.0f; | |
cns->chn_prev_quat_x[chn] = 0.0f; | |
cns->chn_prev_quat_y[chn] = 0.0f; | |
cns->chn_prev_quat_z[chn] = 0.0f; | |
cns->chn_prev_quat_w[chn] = 1.0f; | |
_mm256_store_ps(&cns->ptc_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_inv_mass[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_inv_mass[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_prev_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_prev_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_prev_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->rest_pos_x[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->rest_pos_x[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->rest_pos_y[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->rest_pos_y[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->rest_pos_z[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->rest_pos_z[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->motion_radius[p_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->motion_radius[p_idx + 8], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->sph_x[s_idx], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->sph_y[s_idx], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->sph_z[s_idx], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->sph_r[s_idx], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_rest_len[c_idx + 0], _mm256_set1_ps(0.0f)); | |
_mm256_store_ps(&cns->ptc_rest_len[c_idx + 8], _mm256_set1_ps(0.0f)); | |
} | |
extern void | |
chn_sim_mov(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { | |
assert(cns); | |
assert(pos3); | |
assert(rot4); | |
assert(chn >= 0); | |
assert(chn < MAX_CHNS); | |
cns->chn_pos_x[chn] = pos3[0]; | |
cns->chn_pos_y[chn] = pos3[1]; | |
cns->chn_pos_z[chn] = pos3[2]; | |
cns->chn_quat_x[chn] = rot4[0]; | |
cns->chn_quat_y[chn] = rot4[1]; | |
cns->chn_quat_z[chn] = rot4[2]; | |
cns->chn_quat_w[chn] = rot4[3]; | |
} | |
extern void | |
chn_sim_tel(struct chn_sim *cns, int chn, const float *restrict pos3, const float *restrict rot4) { | |
assert(cns); | |
assert(pos3); | |
assert(rot4); | |
assert(chn >= 0); | |
assert(chn < MAX_CHNS); | |
cns->chn_pos_x[chn] = cns->chn_prev_pos_x[chn] = pos3[0]; | |
cns->chn_pos_y[chn] = cns->chn_prev_pos_y[chn] = pos3[1]; | |
cns->chn_pos_z[chn] = cns->chn_prev_pos_z[chn] = pos3[2]; | |
cns->chn_quat_x[chn] = cns->chn_prev_quat_x[chn] = rot4[0]; | |
cns->chn_quat_y[chn] = cns->chn_prev_quat_y[chn] = rot4[1]; | |
cns->chn_quat_z[chn] = cns->chn_prev_quat_z[chn] = rot4[2]; | |
cns->chn_quat_w[chn] = cns->chn_prev_quat_w[chn] = rot4[3]; | |
} | |
extern void | |
chn_sim_grav(struct chn_sim *cns, int chn, const float *restrict grav3) { | |
assert(cns); | |
assert(grav3); | |
assert(chn >= 0); | |
assert(chn < MAX_CHNS); | |
cns->chn_grav_x[chn] = grav3[0]; | |
cns->chn_grav_y[chn] = grav3[1]; | |
cns->chn_grav_z[chn] = grav3[2]; | |
} | |
extern void | |
chn_sim_wind(struct chn_sim *cns, int chn, const float *restrict wind3) { | |
assert(cns); | |
assert(wind3); | |
assert(chn >= 0); | |
assert(chn < MAX_CHNS); | |
cns->chn_wnd_x[chn] = wind3[0]; | |
cns->chn_wnd_y[chn] = wind3[1]; | |
cns->chn_wnd_z[chn] = wind3[2]; | |
} | |
extern void | |
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *spheres, int cnt) { | |
assert(cns); | |
assert(spheres); | |
assert(chn >= 0); | |
assert(chn < MAX_CHNS); | |
assert(cnt <= SPH_PER_CHN); | |
int s_idx = (chn * SPH_PER_CHN); | |
for (int i = 0; i < cnt; i++) { | |
cns->sph_x[s_idx + i] = spheres[i*4+0]; | |
cns->sph_y[s_idx + i] = spheres[i*4+1]; | |
cns->sph_z[s_idx + i] = spheres[i*4+2]; | |
cns->sph_r[s_idx + i] = spheres[i*4+3]; | |
} | |
for (int i = cnt; i < SPH_PER_CHN; ++i) { | |
cns->sph_x[s_idx + i] = 0; | |
cns->sph_y[s_idx + i] = 0; | |
cns->sph_z[s_idx + i] = 0; | |
cns->sph_r[s_idx + i] = 0; | |
} | |
} | |
extern void | |
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *radius, int cnt) { | |
assert(cns); | |
assert(chn >= 0); | |
assert(chn < MAX_CHNS); | |
assert(cnt >= 0); | |
assert(cnt <= PTC_PER_CHN); | |
int p_idx = (chn * PTC_PER_CHN); | |
for (int i = 0; i < cnt; ++i) { | |
cns->motion_radius[p_idx + i] = radius[i]; | |
} | |
for (int i = cnt; i < PTC_PER_CHN; ++i) { | |
cns->motion_radius[p_idx + i] = FLT_MAX; | |
} | |
} | |
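// Example (hedged, illustrative): attaching one collision sphere and loose | |
// per-particle motion limits to a chain; the values are made up, and | |
// example_constrain_chain is a hypothetical helper, not part of the API above. | |
static void | |
example_constrain_chain(struct chn_sim *cns, int chn) { | |
const float sph[1 * 4] = { 0.0f, -1.0f, 0.0f, 0.25f }; // x, y, z, radius | |
const float radius[4] = { 0.0f, 0.3f, 0.6f, 0.9f }; // tighter near the root | |
chn_sim_set_sph(cns, chn, sph, 1); | |
chn_sim_con_motion(cns, chn, radius, 4); | |
} | |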
extern void | |
chn_sim_run(struct chn_sim *cns, float dt) { | |
// SIMD constants | |
const __m256 dt_vec = _mm256_set1_ps(dt); | |
const __m256 dt2_vec = _mm256_set1_ps(dt * dt); | |
const __m256 one_vec = _mm256_set1_ps(1.0f); | |
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); | |
const __m256 eps_vec = _mm256_set1_ps(1e-6f); | |
const __m256 ptc_inv_mass_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 ptc_inv_mass_clamp_max = _mm256_set1_ps(1e6f); | |
const __m256 zero_vec = _mm256_setzero_ps(); | |
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); | |
const __m256 two_vec = _mm256_set1_ps(2.0f); | |
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); | |
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
// Local arrays for PBD solver | |
float px_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); | |
float py_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); | |
float pz_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); | |
float pm_lcl[PTC_PER_CHN + 1] __attribute__((aligned(32))); | |
// Initialize dummy elements | |
px_lcl[PTC_PER_CHN] = 0.0f; | |
py_lcl[PTC_PER_CHN] = 0.0f; | |
pz_lcl[PTC_PER_CHN] = 0.0f; | |
pm_lcl[PTC_PER_CHN] = 0.0f; | |
// Stack arrays for precomputed data | |
float chn_wnd_lcl_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_wnd_lcl_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_wnd_lcl_z[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_grav_lcl_x[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_grav_lcl_y[MAX_CHNS] __attribute__((aligned(32))); | |
float chn_grav_lcl_z[MAX_CHNS] __attribute__((aligned(32))); | |
float vel_x[MAX_CHNS] __attribute__((aligned(32))); | |
float vel_y[MAX_CHNS] __attribute__((aligned(32))); | |
float vel_z[MAX_CHNS] __attribute__((aligned(32))); | |
float ang_vel_x[MAX_CHNS] __attribute__((aligned(32))); | |
float ang_vel_y[MAX_CHNS] __attribute__((aligned(32))); | |
float ang_vel_z[MAX_CHNS] __attribute__((aligned(32))); | |
// Step 0: Precomputation (SIMD, 8 chains at once) | |
for (int c = 0; c < MAX_CHNS; c += 8) { | |
// Load chain quaternions | |
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]); | |
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]); | |
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]); | |
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]); | |
// Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
// Compute local-space wind | |
{ | |
__m256 vx = _mm256_load_ps(&cns->chn_wnd_x[c]); | |
__m256 vy = _mm256_load_ps(&cns->chn_wnd_y[c]); | |
__m256 vz = _mm256_load_ps(&cns->chn_wnd_z[c]); | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: res_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// res_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// res_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// res_z = tw*qz + tz*qw + tx*qy - ty*qx | |
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x)); | |
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y)); | |
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z)); | |
_mm256_store_ps(&chn_wnd_lcl_x[c], res_x); | |
_mm256_store_ps(&chn_wnd_lcl_y[c], res_y); | |
_mm256_store_ps(&chn_wnd_lcl_z[c], res_z); | |
} | |
// Compute local-space gravity | |
{ | |
__m256 vx = _mm256_load_ps(&cns->chn_grav_x[c]); | |
__m256 vy = _mm256_load_ps(&cns->chn_grav_y[c]); | |
__m256 vz = _mm256_load_ps(&cns->chn_grav_z[c]); | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: res_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// res_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// res_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// res_z = tw*qz + tz*qw + tx*qy - ty*qx | |
__m256 res_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
res_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, res_x)); | |
__m256 res_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
res_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, res_y)); | |
__m256 res_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
res_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, res_z)); | |
_mm256_store_ps(&chn_grav_lcl_x[c], res_x); | |
_mm256_store_ps(&chn_grav_lcl_y[c], res_y); | |
_mm256_store_ps(&chn_grav_lcl_z[c], res_z); | |
} | |
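// Both blocks above rotate a world-space vector v into chain-local space via | |
// the sandwich product v_local = conj(q) * (0, v) * q, in scalar form (hedged): | |
//   t  = quat_mul(conj(q), (0, v));   // t = (tw, tx, ty, tz) | |
//   vl = vec(quat_mul(t, q));         // imaginary part is the rotated vector | |
// For unit quaternions conj(q) == q^-1, so this applies the inverse rotation. | |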
// Compute linear velocity | |
{ | |
__m256 curr_x = _mm256_load_ps(&cns->chn_pos_x[c]); | |
__m256 curr_y = _mm256_load_ps(&cns->chn_pos_y[c]); | |
__m256 curr_z = _mm256_load_ps(&cns->chn_pos_z[c]); | |
__m256 prev_x = _mm256_load_ps(&cns->chn_prev_pos_x[c]); | |
__m256 prev_y = _mm256_load_ps(&cns->chn_prev_pos_y[c]); | |
__m256 prev_z = _mm256_load_ps(&cns->chn_prev_pos_z[c]); | |
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); | |
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); | |
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec); | |
_mm256_store_ps(&vel_x[c], vel_x_vec); | |
_mm256_store_ps(&vel_y[c], vel_y_vec); | |
_mm256_store_ps(&vel_z[c], vel_z_vec); | |
} | |
// Compute angular velocity | |
{ | |
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c]); | |
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c]); | |
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c]); | |
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c]); | |
__m256 prev_qx = _mm256_load_ps(&cns->chn_prev_quat_x[c]); | |
__m256 prev_qy = _mm256_load_ps(&cns->chn_prev_quat_y[c]); | |
__m256 prev_qz = _mm256_load_ps(&cns->chn_prev_quat_z[c]); | |
__m256 prev_qw = _mm256_load_ps(&cns->chn_prev_quat_w[c]); | |
// Step 1: Compute delta quaternion (to - from) | |
__m256 dt_qx = _mm256_sub_ps(qx, prev_qx); | |
__m256 dt_qy = _mm256_sub_ps(qy, prev_qy); | |
__m256 dt_qz = _mm256_sub_ps(qz, prev_qz); | |
__m256 dt_qw = _mm256_sub_ps(qw, prev_qw); | |
// Step 2: Compute conjugate of previous quaternion (qw, -qx, -qy, -qz) | |
__m256 conj_qx = _mm256_sub_ps(zero_vec, prev_qx); | |
__m256 conj_qy = _mm256_sub_ps(zero_vec, prev_qy); | |
__m256 conj_qz = _mm256_sub_ps(zero_vec, prev_qz); | |
__m256 conj_qw = prev_qw; | |
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) | |
// Quaternion multiplication: (w1w2 - v1·v2, w1v2 + w2v1 + v1×v2) | |
// Result: orientationDelta = (dt_qw, dt_qx, dt_qy, dt_qz) * (conj_qw, conj_qx, conj_qy, conj_qz) | |
__m256 orient_qw = _mm256_fnmadd_ps(dt_qx, conj_qx, _mm256_mul_ps(dt_qw, conj_qw)); | |
orient_qw = _mm256_fnmadd_ps(dt_qy, conj_qy, orient_qw); | |
orient_qw = _mm256_fnmadd_ps(dt_qz, conj_qz, orient_qw); | |
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(dt_qy, conj_qz), _mm256_mul_ps(dt_qz, conj_qy)); | |
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(dt_qz, conj_qx), _mm256_mul_ps(dt_qx, conj_qz)); | |
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(dt_qx, conj_qy), _mm256_mul_ps(dt_qy, conj_qx)); | |
__m256 orient_qx = _mm256_fmadd_ps(dt_qw, conj_qx, _mm256_mul_ps(dt_qx, conj_qw)); | |
orient_qx = _mm256_add_ps(orient_qx, cross_x); | |
__m256 orient_qy = _mm256_fmadd_ps(dt_qw, conj_qy, _mm256_mul_ps(dt_qy, conj_qw)); | |
orient_qy = _mm256_add_ps(orient_qy, cross_y); | |
__m256 orient_qz = _mm256_fmadd_ps(dt_qw, conj_qz, _mm256_mul_ps(dt_qz, conj_qw)); | |
orient_qz = _mm256_add_ps(orient_qz, cross_z); | |
// Step 4: Compute dot product (to, from) for shortest arc check | |
__m256 dot = _mm256_fmadd_ps(qx, prev_qx, _mm256_mul_ps(qw, prev_qw)); | |
dot = _mm256_fmadd_ps(qy, prev_qy, dot); | |
dot = _mm256_fmadd_ps(qz, prev_qz, dot); | |
// Step 5: Negate orientation delta if dot < 0 (shortest arc adjustment) | |
__m256 negate_mask = _mm256_cmp_ps(dot, zero_vec, _CMP_LT_OQ); | |
__m256 negate_sign = _mm256_or_ps(_mm256_and_ps(negate_mask, neg_one_vec), | |
_mm256_andnot_ps(negate_mask, one_vec)); | |
orient_qx = _mm256_mul_ps(orient_qx, negate_sign); | |
orient_qy = _mm256_mul_ps(orient_qy, negate_sign); | |
orient_qz = _mm256_mul_ps(orient_qz, negate_sign); | |
// Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) | |
__m256 scale = _mm256_div_ps(one_vec, _mm256_mul_ps(two_vec, dt_vec)); | |
__m256 ang_vel_x_vec = _mm256_mul_ps(orient_qx, scale); | |
__m256 ang_vel_y_vec = _mm256_mul_ps(orient_qy, scale); | |
__m256 ang_vel_z_vec = _mm256_mul_ps(orient_qz, scale); | |
// Store results | |
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); | |
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); | |
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); | |
} | |
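// Scalar sketch of the estimate above (illustrative): with dq the | |
// finite difference q - q_prev, the orientation delta is | |
//   d = dq * conj(q_prev) | |
// whose vector part, negated when dot(q, q_prev) < 0 to take the | |
// shorter arc, approximates the frame-to-frame rotation; the code | |
// then scales it by 1 / (2 * dt) to get the angular velocity below: | |
//   w = sign * (d.x, d.y, d.z) / (2.0f * dt); | |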
} | |
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) | |
for (int c = 0; c < MAX_CHNS; ++c) { | |
int base_idx = c * PTC_PER_CHN; | |
struct chn_cfg *cfg = &cns->chn_cfg[c]; | |
// Load precomputed velocities and inertia parameters | |
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); | |
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); | |
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); | |
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); | |
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); | |
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]); | |
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia); | |
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); | |
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); | |
// Clamp inertia parameters to [0, 1] | |
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
for (int i = 0; i < PTC_PER_CHN; i += 8) { | |
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]); | |
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]); | |
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]); | |
__m256 p_ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]); | |
p_ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(p_ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
// Linear inertia: v * dt * linear_inertia | |
__m256 lin_dt_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_dt_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_dt_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); | |
// Angular inertia: (ω × r) * dt * angular_inertia | |
__m256 ang_dt_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 ang_dt_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 ang_dt_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
ang_dt_x = _mm256_mul_ps(_mm256_mul_ps(ang_dt_x, dt_vec), angular_inertia_vec); | |
ang_dt_y = _mm256_mul_ps(_mm256_mul_ps(ang_dt_y, dt_vec), angular_inertia_vec); | |
ang_dt_z = _mm256_mul_ps(_mm256_mul_ps(ang_dt_z, dt_vec), angular_inertia_vec); | |
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia | |
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y)); | |
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); | |
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); | |
// Calculate displacement: (ω × (ω × r)) * dt^2 | |
__m256 base_cent_dt_x = _mm256_mul_ps(cross2_x, dt2_vec); | |
__m256 base_cent_dt_y = _mm256_mul_ps(cross2_y, dt2_vec); | |
__m256 base_cent_dt_z = _mm256_mul_ps(cross2_z, dt2_vec); | |
// Apply the centrifugal_inertia factor | |
__m256 cent_dt_x = _mm256_mul_ps(base_cent_dt_x, centrifugal_inertia_vec); | |
__m256 cent_dt_y = _mm256_mul_ps(base_cent_dt_y, centrifugal_inertia_vec); | |
__m256 cent_dt_z = _mm256_mul_ps(base_cent_dt_z, centrifugal_inertia_vec); | |
// Total delta | |
__m256 dt_x = _mm256_add_ps(_mm256_add_ps(lin_dt_x, ang_dt_x), cent_dt_x); | |
__m256 dt_y = _mm256_add_ps(_mm256_add_ps(lin_dt_y, ang_dt_y), cent_dt_y); | |
__m256 dt_z = _mm256_add_ps(_mm256_add_ps(lin_dt_z, ang_dt_z), cent_dt_z); | |
// Update positions | |
_mm256_store_ps(&cns->ptc_pos_x[base_idx + i], _mm256_add_ps(px, dt_x)); | |
_mm256_store_ps(&cns->ptc_pos_y[base_idx + i], _mm256_add_ps(py, dt_y)); | |
_mm256_store_ps(&cns->ptc_pos_z[base_idx + i], _mm256_add_ps(pz, dt_z)); | |
} | |
// Update previous transformation | |
cns->chn_prev_pos_x[c] = cns->chn_pos_x[c]; | |
cns->chn_prev_pos_y[c] = cns->chn_pos_y[c]; | |
cns->chn_prev_pos_z[c] = cns->chn_pos_z[c]; | |
cns->chn_prev_quat_x[c] = cns->chn_quat_x[c]; | |
cns->chn_prev_quat_y[c] = cns->chn_quat_y[c]; | |
cns->chn_prev_quat_z[c] = cns->chn_quat_z[c]; | |
cns->chn_prev_quat_w[c] = cns->chn_quat_w[c]; | |
} | |
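// The displacement applied per particle above, in scalar form | |
// (illustrative; r is the particle position in chain-local space): | |
//   dp = v * dt * k_lin                        // linear inertia | |
//      + cross(w, r) * dt * k_ang              // angular inertia | |
//      + cross(w, cross(w, r)) * dt*dt * k_cent // centrifugal inertia | |
// where k_lin/k_ang/k_cent are the chn_cfg factors clamped to [0, 1], | |
// so 1.0f recovers the physically correct term and 0.0f disables it. | |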
// Step 2: Verlet Integration | |
for (int c = 0; c < MAX_CHNS; ++c) { | |
int base_idx = c * PTC_PER_CHN; | |
struct chn_cfg *cfg = &cns->chn_cfg[c]; | |
__m256 chn_grav_x_vec = _mm256_set1_ps(chn_grav_lcl_x[c]); | |
__m256 chn_grav_y_vec = _mm256_set1_ps(chn_grav_lcl_y[c]); | |
__m256 chn_grav_z_vec = _mm256_set1_ps(chn_grav_lcl_z[c]); | |
__m256 chn_wnd_x_vec = _mm256_set1_ps(chn_wnd_lcl_x[c]); | |
__m256 chn_wnd_y_vec = _mm256_set1_ps(chn_wnd_lcl_y[c]); | |
__m256 chn_wnd_z_vec = _mm256_set1_ps(chn_wnd_lcl_z[c]); | |
__m256 damping_vec = _mm256_set1_ps(cfg->damping); | |
for (int i = 0; i < PTC_PER_CHN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[base_idx + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[base_idx + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[base_idx + i]); | |
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[base_idx + i]); | |
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[base_idx + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[base_idx + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[base_idx + i]); | |
__m256 force_x = _mm256_add_ps(chn_grav_x_vec, chn_wnd_x_vec); | |
__m256 force_y = _mm256_add_ps(chn_grav_y_vec, chn_wnd_y_vec); | |
__m256 force_z = _mm256_add_ps(chn_grav_z_vec, chn_wnd_z_vec); | |
__m256 acc_x = _mm256_mul_ps(force_x, ptc_inv_mass); | |
__m256 acc_y = _mm256_mul_ps(force_y, ptc_inv_mass); | |
__m256 acc_z = _mm256_mul_ps(force_z, ptc_inv_mass); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec); | |
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec); | |
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec); | |
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec); | |
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec); | |
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x)); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); | |
_mm256_store_ps(&cns->ptc_pos_x[base_idx + i], new_p_x); | |
_mm256_store_ps(&cns->ptc_pos_y[base_idx + i], new_p_y); | |
_mm256_store_ps(&cns->ptc_pos_z[base_idx + i], new_p_z); | |
_mm256_store_ps(&cns->ptc_prev_pos_x[base_idx + i], p_curr_x); | |
_mm256_store_ps(&cns->ptc_prev_pos_y[base_idx + i], p_curr_y); | |
_mm256_store_ps(&cns->ptc_prev_pos_z[base_idx + i], p_curr_z); | |
} | |
} | |
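// Scalar form of the position-based Verlet step above (illustrative): | |
//   v      = x_n - x_prev;                  // implicit velocity | |
//   x_next = x_n + v * damping + (F * inv_mass) * dt * dt; | |
//   x_prev = x_n; | |
// with F the sum of chain-local gravity and wind; damping in [0, 1] | |
// bleeds off a fixed fraction of the implicit velocity each step. | |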
// Step 3: Distance Constraints | |
for (int c = 0; c < MAX_CHNS; ++c) { | |
int p_base = c * PTC_PER_CHN; | |
int r_base = c * CON_PER_CHN; | |
struct chn_cfg *cfg = &cns->chn_cfg[c]; | |
__m256 stiff_vec = _mm256_set1_ps(cfg->stiffness); | |
__m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor); | |
__m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain); | |
__m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec); | |
for (int i = 0; i < PTC_PER_CHN; i += 8) { | |
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
_mm256_store_ps(&px_lcl[i], _mm256_load_ps(&cns->ptc_pos_x[p_base + i])); | |
_mm256_store_ps(&py_lcl[i], _mm256_load_ps(&cns->ptc_pos_y[p_base + i])); | |
_mm256_store_ps(&pz_lcl[i], _mm256_load_ps(&cns->ptc_pos_z[p_base + i])); | |
_mm256_store_ps(&pm_lcl[i], pm); | |
} | |
for (int iter = 0; iter < MAX_ITR; ++iter) { | |
// First block (i=0 to 7) | |
{ | |
__m256 p0x = _mm256_load_ps(&px_lcl[0]); | |
__m256 p0y = _mm256_load_ps(&py_lcl[0]); | |
__m256 p0z = _mm256_load_ps(&pz_lcl[0]); | |
__m256 p0m = _mm256_load_ps(&pm_lcl[0]); | |
__m256 p1x = _mm256_loadu_ps(&px_lcl[1]); | |
__m256 p1y = _mm256_loadu_ps(&py_lcl[1]); | |
__m256 p1z = _mm256_loadu_ps(&pz_lcl[1]); | |
__m256 p1m = _mm256_loadu_ps(&pm_lcl[1]); | |
__m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base]); | |
__m256 dx = _mm256_sub_ps(p1x, p0x); | |
__m256 dy = _mm256_sub_ps(p1y, p0y); | |
__m256 dz = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 diff = _mm256_sub_ps(dist, rest_len_vec); | |
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec); | |
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); | |
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); | |
__m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); | |
__m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom)); | |
__m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec); | |
corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum); | |
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part); | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
__m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude); | |
__m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude); | |
__m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude); | |
dt_x = _mm256_and_ps(dt_x, apply_corr_mask); | |
dt_y = _mm256_and_ps(dt_y, apply_corr_mask); | |
dt_z = _mm256_and_ps(dt_z, apply_corr_mask); | |
_mm256_store_ps(&px_lcl[0], _mm256_fmadd_ps(dt_x, p0m, p0x)); | |
_mm256_store_ps(&py_lcl[0], _mm256_fmadd_ps(dt_y, p0m, p0y)); | |
_mm256_store_ps(&pz_lcl[0], _mm256_fmadd_ps(dt_z, p0m, p0z)); | |
_mm256_storeu_ps(&px_lcl[1], _mm256_fnmadd_ps(dt_x, p1m, p1x)); // unaligned store: the window starts at element 1 | |
_mm256_storeu_ps(&py_lcl[1], _mm256_fnmadd_ps(dt_y, p1m, p1y)); | |
_mm256_storeu_ps(&pz_lcl[1], _mm256_fnmadd_ps(dt_z, p1m, p1z)); | |
} | |
// Second block (i=8 to 14) | |
{ | |
__m256 p0x = _mm256_load_ps(&px_lcl[8]); | |
__m256 p0y = _mm256_load_ps(&py_lcl[8]); | |
__m256 p0z = _mm256_load_ps(&pz_lcl[8]); | |
__m256 p0m = _mm256_load_ps(&pm_lcl[8]); | |
__m256 p1x = _mm256_loadu_ps(&px_lcl[9]); | |
__m256 p1y = _mm256_loadu_ps(&py_lcl[9]); | |
__m256 p1z = _mm256_loadu_ps(&pz_lcl[9]); | |
__m256 p1m = _mm256_loadu_ps(&pm_lcl[9]); | |
__m256 rest_len_vec = _mm256_load_ps(&cns->ptc_rest_len[r_base + 8]); | |
__m256 dx = _mm256_sub_ps(p1x, p0x); | |
__m256 dy = _mm256_sub_ps(p1y, p0y); | |
__m256 dz = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 diff = _mm256_sub_ps(dist, rest_len_vec); | |
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec); | |
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); | |
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); | |
__m256 dyn_stiff_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); | |
__m256 dyn_stiff = _mm256_mul_ps(stiff_vec, _mm256_rcp_ps(dyn_stiff_denom)); | |
__m256 corr_scl_part = _mm256_mul_ps(dyn_stiff, stretch_factor_vec); | |
corr_scl_part = _mm256_mul_ps(corr_scl_part, rcp_w_sum); | |
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scl_part); | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
__attribute__((aligned(32))) unsigned valid_mask_bits[8] = { 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0u }; // all-ones lane masks; -1.0f (bit pattern 0xBF800000) would corrupt the bitwise AND below | |
__m256 valid_mask = _mm256_castsi256_ps(_mm256_load_si256((const __m256i *)valid_mask_bits)); | |
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); | |
__m256 dt_unit_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 dt_unit_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 dt_unit_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 dt_x = _mm256_mul_ps(dt_unit_x, corr_magnitude); | |
__m256 dt_y = _mm256_mul_ps(dt_unit_y, corr_magnitude); | |
__m256 dt_z = _mm256_mul_ps(dt_unit_z, corr_magnitude); | |
dt_x = _mm256_and_ps(dt_x, apply_corr_mask); | |
dt_y = _mm256_and_ps(dt_y, apply_corr_mask); | |
dt_z = _mm256_and_ps(dt_z, apply_corr_mask); | |
_mm256_store_ps(&px_lcl[8], _mm256_fmadd_ps(dt_x, p0m, p0x)); | |
_mm256_store_ps(&py_lcl[8], _mm256_fmadd_ps(dt_y, p0m, p0y)); | |
_mm256_store_ps(&pz_lcl[8], _mm256_fmadd_ps(dt_z, p0m, p0z)); | |
_mm256_storeu_ps(&px_lcl[9], _mm256_fnmadd_ps(dt_x, p1m, p1x)); | |
_mm256_storeu_ps(&py_lcl[9], _mm256_fnmadd_ps(dt_y, p1m, p1y)); | |
_mm256_storeu_ps(&pz_lcl[9], _mm256_fnmadd_ps(dt_z, p1m, p1z)); | |
} | |
} | |
for (int i = 0; i < PTC_PER_CHN; i += 8) { | |
_mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_load_ps(&px_lcl[i])); | |
_mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_load_ps(&py_lcl[i])); | |
_mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_load_ps(&pz_lcl[i])); | |
} | |
} | |
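// One PBD distance-constraint relaxation above, in scalar form | |
// (illustrative): | |
//   d   = p1 - p0;  dist = |d|;  n = d / dist; | |
//   C   = dist - rest_len; | |
//   s   = clamp(C / rest_len, -max_strain, +max_strain); | |
//   k   = stiffness / (1 + |s|);             // soften as strain grows | |
//   dp  = n * C * k * stretch_factor / (w0 + w1); | |
//   p0 += dp * w0;  p1 -= dp * w1; | |
// MAX_ITR (16) Gauss-Seidel-style sweeps of this run per frame. | |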
// Step 4: Sphere Collisions with Friction | |
for (int c = 0; c < MAX_CHNS; ++c) { | |
int p_base = c * PTC_PER_CHN; | |
int s_base = c * SPH_PER_CHN; | |
struct chn_cfg *cfg = &cns->chn_cfg[c]; | |
__m256 fric_vec = _mm256_set1_ps(cfg->friction); | |
for (int s = 0; s < SPH_PER_CHN; ++s) { | |
__m256 sph_x_vec = _mm256_set1_ps(cns->sph_x[s_base + s]); | |
__m256 sph_y_vec = _mm256_set1_ps(cns->sph_y[s_base + s]); | |
__m256 sph_z_vec = _mm256_set1_ps(cns->sph_z[s_base + s]); | |
__m256 sph_r_vec = _mm256_set1_ps(cns->sph_r[s_base + s]); | |
for (int i = 0; i < PTC_PER_CHN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]); | |
__m256 p_prev_x = _mm256_load_ps(&cns->ptc_prev_pos_x[p_base + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cns->ptc_prev_pos_y[p_base + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cns->ptc_prev_pos_z[p_base + i]); | |
__m256 ptc_inv_mass = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]); | |
ptc_inv_mass = _mm256_max_ps(_mm256_min_ps(ptc_inv_mass, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
__m256 dx = _mm256_sub_ps(p_curr_x, sph_x_vec); | |
__m256 dy = _mm256_sub_ps(p_curr_y, sph_y_vec); | |
__m256 dz = _mm256_sub_ps(p_curr_z, sph_z_vec); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 pen = _mm256_sub_ps(sph_r_vec, dist); | |
__m256 col_mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); | |
__m256 norm_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 norm_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 norm_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 norm_corr_x = _mm256_mul_ps(norm_x, pen); | |
__m256 norm_corr_y = _mm256_mul_ps(norm_y, pen); | |
__m256 norm_corr_z = _mm256_mul_ps(norm_z, pen); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 vel_dot_norm = _mm256_mul_ps(vel_x, norm_x); | |
vel_dot_norm = _mm256_fmadd_ps(vel_y, norm_y, vel_dot_norm); | |
vel_dot_norm = _mm256_fmadd_ps(vel_z, norm_z, vel_dot_norm); | |
__m256 tan_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_norm, norm_x)); | |
__m256 tan_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_norm, norm_y)); | |
__m256 tan_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_norm, norm_z)); | |
__m256 tan_mag_sq = _mm256_mul_ps(tan_x, tan_x); | |
tan_mag_sq = _mm256_fmadd_ps(tan_y, tan_y, tan_mag_sq); | |
tan_mag_sq = _mm256_fmadd_ps(tan_z, tan_z, tan_mag_sq); | |
__m256 inv_tan_mag = _mm256_rsqrt_ps(_mm256_max_ps(tan_mag_sq, eps_vec)); | |
__m256 fric_mag = _mm256_mul_ps(pen, fric_vec); | |
__m256 fric_x = _mm256_mul_ps(_mm256_mul_ps(tan_x, inv_tan_mag), fric_mag); | |
__m256 fric_y = _mm256_mul_ps(_mm256_mul_ps(tan_y, inv_tan_mag), fric_mag); | |
__m256 fric_z = _mm256_mul_ps(_mm256_mul_ps(tan_z, inv_tan_mag), fric_mag); | |
__m256 total_corr_x = _mm256_sub_ps(norm_corr_x, fric_x); | |
__m256 total_corr_y = _mm256_sub_ps(norm_corr_y, fric_y); | |
__m256 total_corr_z = _mm256_sub_ps(norm_corr_z, fric_z); | |
total_corr_x = _mm256_and_ps(total_corr_x, col_mask); | |
total_corr_y = _mm256_and_ps(total_corr_y, col_mask); | |
total_corr_z = _mm256_and_ps(total_corr_z, col_mask); | |
total_corr_x = _mm256_mul_ps(total_corr_x, ptc_inv_mass); | |
total_corr_y = _mm256_mul_ps(total_corr_y, ptc_inv_mass); | |
total_corr_z = _mm256_mul_ps(total_corr_z, ptc_inv_mass); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); | |
_mm256_store_ps(&cns->ptc_pos_x[p_base + i], new_p_x); | |
_mm256_store_ps(&cns->ptc_pos_y[p_base + i], new_p_y); | |
_mm256_store_ps(&cns->ptc_pos_z[p_base + i], new_p_z); | |
} | |
} | |
} | |
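// Scalar form of the sphere response above (illustrative): for a | |
// particle p against sphere center c and radius r, with penetration | |
// pen = r - |p - c| > 0 and implicit velocity v = p - p_prev: | |
//   n  = (p - c) / |p - c|;                 // contact normal | |
//   t  = v - dot(v, n) * n;                 // tangential velocity | |
//   p += (n * pen - normalize(t) * pen * friction) * inv_mass; | |
// i.e. push out along the normal while friction removes an amount of | |
// tangential motion proportional to the penetration depth. | |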
// Step 5: Motion Constraints | |
for (int c = 0; c < MAX_CHNS; ++c) { | |
int p_base = c * PTC_PER_CHN; | |
for (int i = 0; i < PTC_PER_CHN; i += 8) { | |
__m256 px = _mm256_load_ps(&cns->ptc_pos_x[p_base + i]); | |
__m256 py = _mm256_load_ps(&cns->ptc_pos_y[p_base + i]); | |
__m256 pz = _mm256_load_ps(&cns->ptc_pos_z[p_base + i]); | |
__m256 pm = _mm256_load_ps(&cns->ptc_inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, ptc_inv_mass_clamp_max), ptc_inv_mass_clamp_min); | |
__m256 rx = _mm256_load_ps(&cns->rest_pos_x[p_base + i]); | |
__m256 ry = _mm256_load_ps(&cns->rest_pos_y[p_base + i]); | |
__m256 rz = _mm256_load_ps(&cns->rest_pos_z[p_base + i]); | |
__m256 r_vec = _mm256_load_ps(&cns->motion_radius[p_base + i]); | |
__m256 dx = _mm256_sub_ps(px, rx); | |
__m256 dy = _mm256_sub_ps(py, ry); | |
__m256 dz = _mm256_sub_ps(pz, rz); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 pen = _mm256_sub_ps(dist, r_vec); | |
__m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ); | |
__m256 corr_factor = _mm256_mul_ps(pen, inv_dist); | |
corr_factor = _mm256_and_ps(corr_factor, mask); | |
__m256 dt_x = _mm256_mul_ps(dx, corr_factor); | |
__m256 dt_y = _mm256_mul_ps(dy, corr_factor); | |
__m256 dt_z = _mm256_mul_ps(dz, corr_factor); | |
dt_x = _mm256_mul_ps(dt_x, pm); | |
dt_y = _mm256_mul_ps(dt_y, pm); | |
dt_z = _mm256_mul_ps(dt_z, pm); | |
_mm256_store_ps(&cns->ptc_pos_x[p_base + i], _mm256_sub_ps(px, dt_x)); | |
_mm256_store_ps(&cns->ptc_pos_y[p_base + i], _mm256_sub_ps(py, dt_y)); | |
_mm256_store_ps(&cns->ptc_pos_z[p_base + i], _mm256_sub_ps(pz, dt_z)); | |
} | |
} | |
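// Scalar form of the motion constraint above (illustrative): each | |
// particle is tethered inside a sphere of motion_radius around its | |
// rest position: | |
//   d = p - rest;  pen = |d| - radius; | |
//   if (pen > 0) p -= (d / |d|) * pen * inv_mass; | |
// which clamps accumulated drift without touching particles that | |
// are still inside their allowed radius. | |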
} |
#define MAX_CHAINS 64 | |
#define PARTICLES_PER_CHAIN 16 | |
#define SPHERES_PER_CHAIN 8 | |
#define MAX_PARTICLES (MAX_CHAINS * PARTICLES_PER_CHAIN) | |
#define MAX_SPHERES (MAX_CHAINS * SPHERES_PER_CHAIN) | |
#define CONSTRAINTS_PER_CHAIN PARTICLES_PER_CHAIN | |
#define MAX_REST_LENGTHS (MAX_CHAINS * CONSTRAINTS_PER_CHAIN) | |
#define MAX_ITERATIONS 8 | |
// Define chain configuration structure | |
struct chain_cfg { | |
uniform float damping; | |
uniform float stiffness; | |
uniform float stretch_factor; | |
uniform float max_strain; | |
uniform float friction; | |
uniform float collision_radius; | |
uniform float linear_inertia; // [0, 1] | |
uniform float angular_inertia; // [0, 1] | |
uniform float centrifugal_inertia;// [0, 1] | |
}; | |
// Define chain simulation structure | |
struct chain_sim { | |
uint8 free_idx_cnt; | |
uint8 free_idx[MAX_CHAINS]; | |
uniform struct chain_cfg chain_configs[MAX_CHAINS]; | |
// Chain global forces (world space gravity and wind) | |
uniform float gravity_x[MAX_CHAINS]; | |
uniform float gravity_y[MAX_CHAINS]; | |
uniform float gravity_z[MAX_CHAINS]; | |
uniform float wind_x[MAX_CHAINS]; | |
uniform float wind_y[MAX_CHAINS]; | |
uniform float wind_z[MAX_CHAINS]; | |
// Chain transformations (world space) | |
uniform float chain_pos_x[MAX_CHAINS]; | |
uniform float chain_pos_y[MAX_CHAINS]; | |
uniform float chain_pos_z[MAX_CHAINS]; | |
uniform float chain_quat_x[MAX_CHAINS]; | |
uniform float chain_quat_y[MAX_CHAINS]; | |
uniform float chain_quat_z[MAX_CHAINS]; | |
uniform float chain_quat_w[MAX_CHAINS]; | |
uniform float prev_chain_pos_x[MAX_CHAINS]; | |
uniform float prev_chain_pos_y[MAX_CHAINS]; | |
uniform float prev_chain_pos_z[MAX_CHAINS]; | |
uniform float prev_chain_quat_x[MAX_CHAINS]; | |
uniform float prev_chain_quat_y[MAX_CHAINS]; | |
uniform float prev_chain_quat_z[MAX_CHAINS]; | |
uniform float prev_chain_quat_w[MAX_CHAINS]; | |
// Particle positions (model space) | |
uniform float p_pos_x[MAX_PARTICLES]; | |
uniform float p_pos_y[MAX_PARTICLES]; | |
uniform float p_pos_z[MAX_PARTICLES]; | |
uniform float prev_p_pos_x[MAX_PARTICLES]; | |
uniform float prev_p_pos_y[MAX_PARTICLES]; | |
uniform float prev_p_pos_z[MAX_PARTICLES]; | |
uniform float inv_mass[MAX_PARTICLES]; | |
uniform float rest_lengths[MAX_REST_LENGTHS]; | |
// Sphere positions (model space) | |
uniform float sphere_x[MAX_SPHERES]; | |
uniform float sphere_y[MAX_SPHERES]; | |
uniform float sphere_z[MAX_SPHERES]; | |
uniform float sphere_radius[MAX_SPHERES]; | |
// Rest positions (model space) | |
uniform float rest_pos_x[MAX_PARTICLES]; | |
uniform float rest_pos_y[MAX_PARTICLES]; | |
uniform float rest_pos_z[MAX_PARTICLES]; | |
uniform float motion_radius[MAX_PARTICLES]; | |
}; | |
// ISPC task for chain simulation | |
export void simulate_chains( | |
__attribute__((aligned(32))) uniform float *uniform chain_quat_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform chain_quat_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform chain_quat_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform chain_quat_w_ptr, | |
__attribute__((aligned(32))) uniform float *uniform wind_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform wind_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform wind_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform gravity_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform gravity_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform gravity_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform chain_pos_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform chain_pos_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform chain_pos_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_chain_pos_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_chain_pos_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_chain_pos_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_chain_quat_w_ptr, | |
__attribute__((aligned(32))) uniform float *uniform p_pos_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform p_pos_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform p_pos_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform inv_mass_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_p_pos_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_p_pos_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform prev_p_pos_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform rest_lengths_ptr, | |
__attribute__((aligned(32))) uniform struct chain_cfg *uniform chain_configs_ptr, // Passed as pointer to array of structs | |
__attribute__((aligned(32))) uniform float *uniform sphere_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform sphere_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform sphere_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform sphere_radius_ptr, | |
__attribute__((aligned(32))) uniform float *uniform rest_pos_x_ptr, | |
__attribute__((aligned(32))) uniform float *uniform rest_pos_y_ptr, | |
__attribute__((aligned(32))) uniform float *uniform rest_pos_z_ptr, | |
__attribute__((aligned(32))) uniform float *uniform motion_radius_ptr, | |
uniform float dt) { | |
// Common SIMD constants (varying for vector operations across lanes) | |
varying float dt_vec = dt; | |
varying float dt2_vec = dt * dt; | |
varying float one_vec = 1.0; | |
varying float neg_one_vec = -1.0; | |
varying float eps_vec = 1e-6; | |
varying float inv_mass_clamp_min = 0.0; | |
varying float inv_mass_clamp_max = 1e6; | |
varying float zero_vec = 0.0; | |
varying float inv_dt_vec = 1.0 / dt; | |
varying float two_vec = 2.0; | |
varying float inertia_clamp_min = 0.0; | |
varying float inertia_clamp_max = 1.0; | |
assume(MAX_CHAINS % programCount == 0); | |
assume(PARTICLES_PER_CHAIN % programCount == 0); | |
assume(SPHERES_PER_CHAIN % programCount == 0); | |
assume(MAX_SPHERES % programCount == 0); | |
assume(CONSTRAINTS_PER_CHAIN % programCount == 0); | |
assume(MAX_REST_LENGTHS % programCount == 0); | |
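// The assume() hints above tell the compiler that every strided loop | |
// below runs in full vectors (all counts divisible by programCount), | |
// so it can drop partial-lane masking and scalar tail loops. | |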
// Local arrays for PBD solver | |
__attribute__((aligned(32))) uniform float px_local[PARTICLES_PER_CHAIN + 1]; | |
__attribute__((aligned(32))) uniform float py_local[PARTICLES_PER_CHAIN + 1]; | |
__attribute__((aligned(32))) uniform float pz_local[PARTICLES_PER_CHAIN + 1]; | |
__attribute__((aligned(32))) uniform float pm_local[PARTICLES_PER_CHAIN + 1]; | |
// All lanes can safely write to these uniform locations. | |
px_local[PARTICLES_PER_CHAIN] = 0.0; | |
py_local[PARTICLES_PER_CHAIN] = 0.0; | |
pz_local[PARTICLES_PER_CHAIN] = 0.0; | |
pm_local[PARTICLES_PER_CHAIN] = 0.0; | |
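// The +1 padding element exists because the constraint solver loads | |
// the shifted window p[i + programIndex + 1]; for the last lane that | |
// read lands one slot past the final particle, so the zeroed pad | |
// keeps it in bounds (the lane is masked out before use). | |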
// Stack arrays for precomputed data | |
__attribute__((aligned(32))) uniform float wind_local_x[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float wind_local_y[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float wind_local_z[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float gravity_local_x[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float gravity_local_y[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float gravity_local_z[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float vel_x[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float vel_y[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float vel_z[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float ang_vel_x[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float ang_vel_y[MAX_CHAINS]; | |
__attribute__((aligned(32))) uniform float ang_vel_z[MAX_CHAINS]; | |
// Step 0: Precomputation (SIMD, process chains in groups of programCount) | |
for (uniform int c = 0; c < MAX_CHAINS; c += programCount) { | |
// Load chain quaternions | |
varying float qx = chain_quat_x_ptr[c + programIndex]; | |
varying float qy = chain_quat_y_ptr[c + programIndex]; | |
varying float qz = chain_quat_z_ptr[c + programIndex]; | |
varying float qw = chain_quat_w_ptr[c + programIndex]; | |
// Compute local-space wind | |
{ | |
varying float vx = wind_x_ptr[c + programIndex]; | |
varying float vy = wind_y_ptr[c + programIndex]; | |
varying float vz = wind_z_ptr[c + programIndex]; | |
// Create q_conjugate components | |
varying float cqx = -qx; | |
varying float cqy = -qy; | |
varying float cqz = -qz; | |
// Step 1: t = q_conj * v_world_as_quat | |
varying float tx = qw * vx + cqy * vz - cqz * vy; | |
varying float ty = qw * vy + cqz * vx - cqx * vz; | |
varying float tz = qw * vz + cqx * vy - cqy * vx; | |
varying float tw = qx * vx + qy * vy + qz * vz; | |
// Step 2: result_vec = (t * q)_vec | |
varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy; | |
varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz; | |
varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx; | |
wind_local_x[c + programIndex] = result_x; | |
wind_local_y[c + programIndex] = result_y; | |
wind_local_z[c + programIndex] = result_z; | |
} | |
// Compute local-space gravity | |
{ | |
varying float vx = gravity_x_ptr[c + programIndex]; | |
varying float vy = gravity_y_ptr[c + programIndex]; | |
varying float vz = gravity_z_ptr[c + programIndex]; | |
varying float cqx = -qx; | |
varying float cqy = -qy; | |
varying float cqz = -qz; | |
varying float tx = qw * vx + cqy * vz - cqz * vy; | |
varying float ty = qw * vy + cqz * vx - cqx * vz; | |
varying float tz = qw * vz + cqx * vy - cqy * vx; | |
varying float tw = qx * vx + qy * vy + qz * vz; | |
// Step 2: result_vec = (t * q)_vec | |
varying float result_x = tw * qx + tx * qw + ty * qz - tz * qy; | |
varying float result_y = tw * qy + ty * qw + tz * qx - tx * qz; | |
varying float result_z = tw * qz + tz * qw + tx * qy - ty * qx; | |
gravity_local_x[c + programIndex] = result_x; | |
gravity_local_y[c + programIndex] = result_y; | |
gravity_local_z[c + programIndex] = result_z; | |
} | |
// Compute linear velocity | |
{ | |
varying float curr_x = chain_pos_x_ptr[c + programIndex]; | |
varying float curr_y = chain_pos_y_ptr[c + programIndex]; | |
varying float curr_z = chain_pos_z_ptr[c + programIndex]; | |
varying float prev_x = prev_chain_pos_x_ptr[c + programIndex]; | |
varying float prev_y = prev_chain_pos_y_ptr[c + programIndex]; | |
varying float prev_z = prev_chain_pos_z_ptr[c + programIndex]; | |
varying float vel_x_vec = (curr_x - prev_x) * inv_dt_vec; | |
varying float vel_y_vec = (curr_y - prev_y) * inv_dt_vec; | |
varying float vel_z_vec = (curr_z - prev_z) * inv_dt_vec; | |
vel_x[c + programIndex] = vel_x_vec; | |
vel_y[c + programIndex] = vel_y_vec; | |
vel_z[c + programIndex] = vel_z_vec; | |
} | |
// Compute angular velocity | |
{ | |
varying float prev_qx = prev_chain_quat_x_ptr[c + programIndex]; | |
varying float prev_qy = prev_chain_quat_y_ptr[c + programIndex]; | |
varying float prev_qz = prev_chain_quat_z_ptr[c + programIndex]; | |
varying float prev_qw = prev_chain_quat_w_ptr[c + programIndex]; | |
// Step 1: Compute delta quaternion (to - from) - Note: This is an approximation | |
varying float delta_qx = qx - prev_qx; | |
varying float delta_qy = qy - prev_qy; | |
varying float delta_qz = qz - prev_qz; | |
varying float delta_qw = qw - prev_qw; | |
// Step 2: Compute conjugate of previous quaternion | |
varying float conj_qx = -prev_qx; | |
varying float conj_qy = -prev_qy; | |
varying float conj_qz = -prev_qz; | |
varying float conj_qw = prev_qw; | |
// Step 3: Compute orientation delta = deltaQuat * conj(prev_quat) | |
varying float orient_qw = delta_qw * conj_qw - delta_qx * conj_qx - delta_qy * conj_qy - delta_qz * conj_qz; | |
varying float cross_x = delta_qy * conj_qz - delta_qz * conj_qy; | |
varying float cross_y = delta_qz * conj_qx - delta_qx * conj_qz; | |
varying float cross_z = delta_qx * conj_qy - delta_qy * conj_qx; | |
varying float orient_qx = delta_qw * conj_qx + delta_qx * conj_qw + cross_x; | |
varying float orient_qy = delta_qw * conj_qy + delta_qy * conj_qw + cross_y; | |
varying float orient_qz = delta_qw * conj_qz + delta_qz * conj_qw + cross_z; | |
// Step 4: Compute dot product (to, from) for shortest arc check | |
varying float dot = qw * prev_qw + qx * prev_qx + qy * prev_qy + qz * prev_qz; | |
// Step 5: Negate orientation delta if dot < 0 | |
varying float negate_sign = select(dot < zero_vec, neg_one_vec, one_vec); | |
orient_qx *= negate_sign; | |
orient_qy *= negate_sign; | |
orient_qz *= negate_sign; | |
// Step 6: Compute angular velocity: (orient_qx, orient_qy, orient_qz) * (1 / (2 * dt)) | |
varying float scale = one_vec / (two_vec * dt_vec); | |
varying float ang_vel_x_vec = orient_qx * scale; | |
varying float ang_vel_y_vec = orient_qy * scale; | |
varying float ang_vel_z_vec = orient_qz * scale; | |
ang_vel_x[c + programIndex] = ang_vel_x_vec; | |
ang_vel_y[c + programIndex] = ang_vel_y_vec; | |
ang_vel_z[c + programIndex] = ang_vel_z_vec; | |
} | |
} | |
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) | |
for (uniform int c = 0; c < MAX_CHAINS; c += 1) { | |
uniform int base_idx = c * PARTICLES_PER_CHAIN; | |
uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; | |
// Load velocities and inertia (scalar, broadcast to lanes) | |
varying float vel_x_vec = vel_x[c]; | |
varying float vel_y_vec = vel_y[c]; | |
varying float vel_z_vec = vel_z[c]; | |
varying float ang_vel_x_vec = ang_vel_x[c]; | |
varying float ang_vel_y_vec = ang_vel_y[c]; | |
varying float ang_vel_z_vec = ang_vel_z[c]; | |
varying float linear_inertia_vec = cfg->linear_inertia; | |
varying float angular_inertia_vec = cfg->angular_inertia; | |
varying float centrifugal_inertia_vec = cfg->centrifugal_inertia; | |
// Clamp inertia parameters to [0, 1] | |
linear_inertia_vec = clamp(linear_inertia_vec, inertia_clamp_min, inertia_clamp_max); | |
angular_inertia_vec = clamp(angular_inertia_vec, inertia_clamp_min, inertia_clamp_max); | |
centrifugal_inertia_vec = clamp(centrifugal_inertia_vec, inertia_clamp_min, inertia_clamp_max); | |
// Process particles in groups | |
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { | |
varying float px = p_pos_x_ptr[base_idx + i + programIndex]; | |
varying float py = p_pos_y_ptr[base_idx + i + programIndex]; | |
varying float pz = p_pos_z_ptr[base_idx + i + programIndex]; | |
varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex]; | |
p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); | |
// Linear inertia: v * dt * linear_inertia | |
varying float lin_delta_x = vel_x_vec * dt_vec * linear_inertia_vec; | |
varying float lin_delta_y = vel_y_vec * dt_vec * linear_inertia_vec; | |
varying float lin_delta_z = vel_z_vec * dt_vec * linear_inertia_vec; | |
// Angular inertia: (ω × r) * dt * angular_inertia | |
varying float ang_delta_x = ang_vel_y_vec * pz - ang_vel_z_vec * py; | |
varying float ang_delta_y = ang_vel_z_vec * px - ang_vel_x_vec * pz; | |
varying float ang_delta_z = ang_vel_x_vec * py - ang_vel_y_vec * px; | |
ang_delta_x *= dt_vec * angular_inertia_vec; | |
ang_delta_y *= dt_vec * angular_inertia_vec; | |
ang_delta_z *= dt_vec * angular_inertia_vec; | |
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia | |
varying float cross1_x = ang_vel_y_vec * pz - ang_vel_z_vec * py; | |
varying float cross1_y = ang_vel_z_vec * px - ang_vel_x_vec * pz; | |
varying float cross1_z = ang_vel_x_vec * py - ang_vel_y_vec * px; | |
varying float cross2_x = ang_vel_y_vec * cross1_z - ang_vel_z_vec * cross1_y; | |
varying float cross2_y = ang_vel_z_vec * cross1_x - ang_vel_x_vec * cross1_z; | |
varying float cross2_z = ang_vel_x_vec * cross1_y - ang_vel_y_vec * cross1_x; | |
varying float cent_delta_x = cross2_x * dt2_vec * centrifugal_inertia_vec; | |
varying float cent_delta_y = cross2_y * dt2_vec * centrifugal_inertia_vec; | |
varying float cent_delta_z = cross2_z * dt2_vec * centrifugal_inertia_vec; | |
// Total delta | |
varying float delta_x = lin_delta_x + ang_delta_x + cent_delta_x; | |
varying float delta_y = lin_delta_y + ang_delta_y + cent_delta_y; | |
varying float delta_z = lin_delta_z + ang_delta_z + cent_delta_z; | |
// Update positions | |
p_pos_x_ptr[base_idx + i + programIndex] = px + delta_x; | |
p_pos_y_ptr[base_idx + i + programIndex] = py + delta_y; | |
p_pos_z_ptr[base_idx + i + programIndex] = pz + delta_z; | |
} | |
// Update previous transformation (scalar) | |
prev_chain_pos_x_ptr[c] = chain_pos_x_ptr[c]; | |
prev_chain_pos_y_ptr[c] = chain_pos_y_ptr[c]; | |
prev_chain_pos_z_ptr[c] = chain_pos_z_ptr[c]; | |
prev_chain_quat_x_ptr[c] = chain_quat_x_ptr[c]; | |
prev_chain_quat_y_ptr[c] = chain_quat_y_ptr[c]; | |
prev_chain_quat_z_ptr[c] = chain_quat_z_ptr[c]; | |
prev_chain_quat_w_ptr[c] = chain_quat_w_ptr[c]; | |
} | |
// Step 2: Verlet Integration | |
for (uniform int c = 0; c < MAX_CHAINS; c += 1) { | |
uniform int base_idx = c * PARTICLES_PER_CHAIN; | |
uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; | |
// Load forces and damping (scalar, broadcast to lanes) | |
varying float gravity_x_vec = gravity_local_x[c]; | |
varying float gravity_y_vec = gravity_local_y[c]; | |
varying float gravity_z_vec = gravity_local_z[c]; | |
varying float wind_x_vec = wind_local_x[c]; | |
varying float wind_y_vec = wind_local_y[c]; | |
varying float wind_z_vec = wind_local_z[c]; | |
varying float damping_vec = cfg->damping; | |
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { | |
varying float p_curr_x = p_pos_x_ptr[base_idx + i + programIndex]; | |
varying float p_curr_y = p_pos_y_ptr[base_idx + i + programIndex]; | |
varying float p_curr_z = p_pos_z_ptr[base_idx + i + programIndex]; | |
varying float p_inv_mass = inv_mass_ptr[base_idx + i + programIndex]; | |
p_inv_mass = clamp(p_inv_mass, inv_mass_clamp_min, inv_mass_clamp_max); | |
varying float p_prev_x = prev_p_pos_x_ptr[base_idx + i + programIndex]; | |
varying float p_prev_y = prev_p_pos_y_ptr[base_idx + i + programIndex]; | |
varying float p_prev_z = prev_p_pos_z_ptr[base_idx + i + programIndex]; | |
// Compute forces and accelerations | |
varying float force_x = gravity_x_vec + wind_x_vec; | |
varying float force_y = gravity_y_vec + wind_y_vec; | |
varying float force_z = gravity_z_vec + wind_z_vec; | |
varying float acc_x = force_x * p_inv_mass; | |
varying float acc_y = force_y * p_inv_mass; | |
varying float acc_z = force_z * p_inv_mass; | |
// Compute velocity with damping | |
varying float v_x = p_curr_x - p_prev_x; | |
varying float v_y = p_curr_y - p_prev_y; | |
varying float v_z = p_curr_z - p_prev_z; | |
varying float damped_vel_x = v_x * damping_vec; | |
varying float damped_vel_y = v_y * damping_vec; | |
varying float damped_vel_z = v_z * damping_vec; | |
// Update positions: p_curr + damped_vel + acc * dt^2 | |
varying float term_accel_x = acc_x * dt2_vec; | |
varying float term_accel_y = acc_y * dt2_vec; | |
varying float term_accel_z = acc_z * dt2_vec; | |
p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x + damped_vel_x + term_accel_x; | |
p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y + damped_vel_y + term_accel_y; | |
p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z + damped_vel_z + term_accel_z; | |
// Update previous positions | |
prev_p_pos_x_ptr[base_idx + i + programIndex] = p_curr_x; | |
prev_p_pos_y_ptr[base_idx + i + programIndex] = p_curr_y; | |
prev_p_pos_z_ptr[base_idx + i + programIndex] = p_curr_z; | |
} | |
} | |
// Step 3: Distance Constraints | |
for (uniform int c = 0; c < MAX_CHAINS; c += 1) { | |
uniform int p_base = c * PARTICLES_PER_CHAIN; | |
uniform int r_base = c * CONSTRAINTS_PER_CHAIN; | |
uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; | |
varying float stiffness_vec = cfg->stiffness; | |
varying float stretch_factor_vec = cfg->stretch_factor; | |
varying float max_strain_vec_abs = cfg->max_strain; | |
varying float neg_max_strain_vec_abs = max_strain_vec_abs * neg_one_vec; | |
// Copy particles to local arrays with uniform indexing | |
// This loop now performs a vectorized load into the uniform px_local array. | |
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { | |
// Load a vector of particles from global memory into varying registers | |
varying float current_px = p_pos_x_ptr[p_base + i + programIndex]; | |
varying float current_py = p_pos_y_ptr[p_base + i + programIndex]; | |
varying float current_pz = p_pos_z_ptr[p_base + i + programIndex]; | |
varying float current_pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); | |
// Store the varying values into the uniform local array. The | |
// indices are linear in programIndex, so the per-lane stores | |
// inside foreach_active coalesce into contiguous writes. | |
foreach_active (idx) { | |
px_local[i + programIndex] = current_px; | |
py_local[i + programIndex] = current_py; | |
pz_local[i + programIndex] = current_pz; | |
pm_local[i + programIndex] = current_pm; | |
} | |
} | |
// Iterate over constraints | |
for (uniform int iter = 0; iter < MAX_ITERATIONS; iter++) { | |
// Process constraints in groups of programCount (0 to PARTICLES_PER_CHAIN-2) | |
for (uniform int i = 0; i < PARTICLES_PER_CHAIN - 1; i += programCount) { | |
// Now, load directly from uniform px_local using varying programIndex | |
// The compiler can optimize these into vector loads. | |
varying float p0x = px_local[i + programIndex]; | |
varying float p0y = py_local[i + programIndex]; | |
varying float p0z = pz_local[i + programIndex]; | |
varying float p0m = pm_local[i + programIndex]; | |
// For p1, we need to be careful with the indexing relative to programIndex. | |
// The loop condition 'i < PARTICLES_PER_CHAIN - 1' ensures that 'i + programIndex + 1' | |
// will not exceed 'PARTICLES_PER_CHAIN - 1' for any active lane. | |
varying float p1x = px_local[i + programIndex + 1]; | |
varying float p1y = py_local[i + programIndex + 1]; | |
varying float p1z = pz_local[i + programIndex + 1]; | |
varying float p1m = pm_local[i + programIndex + 1]; | |
varying float rest_len_vec = rest_lengths_ptr[r_base + i + programIndex]; | |
varying float dx = p1x - p0x; | |
varying float dy = p1y - p0y; | |
varying float dz = p1z - p0z; | |
varying float dist_sq = dx * dx + dy * dy + dz * dz; | |
varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); | |
varying float dist = rcp(inv_dist); | |
varying float w_sum = p0m + p1m; | |
varying float rcp_w_sum = rcp(max(w_sum, eps_vec)); | |
varying float diff = dist - rest_len_vec; | |
varying float strain = diff * rcp(max(rest_len_vec, eps_vec)); | |
varying float strain_clamped = clamp(strain, neg_max_strain_vec_abs, max_strain_vec_abs); | |
varying float dynamic_stiffness = stiffness_vec * rcp(one_vec + abs(strain_clamped)); | |
varying float corr_magnitude = diff * dynamic_stiffness * stretch_factor_vec * rcp_w_sum; | |
varying float apply_corr_mask = select(dist_sq > eps_vec, 1.0, 0.0); | |
// Mask lanes where i + programIndex >= PARTICLES_PER_CHAIN - 1 (no constraint for last particle) | |
apply_corr_mask = select(i + programIndex < PARTICLES_PER_CHAIN - 1, apply_corr_mask, 0.0); | |
varying float delta_x = (dx * inv_dist) * corr_magnitude * apply_corr_mask; | |
varying float delta_y = (dy * inv_dist) * corr_magnitude * apply_corr_mask; | |
varying float delta_z = (dz * inv_dist) * corr_magnitude * apply_corr_mask; | |
// Apply corrections, writing back to uniform px_local | |
// These are now vector stores. | |
px_local[i + programIndex] += delta_x * p0m; // p0 moves toward p1 when stretched (signs match the AVX2 version) | |
py_local[i + programIndex] += delta_y * p0m; | |
pz_local[i + programIndex] += delta_z * p0m; | |
px_local[i + programIndex + 1] -= delta_x * p1m; | |
py_local[i + programIndex + 1] -= delta_y * p1m; | |
pz_local[i + programIndex + 1] -= delta_z * p1m; | |
} | |
} | |
// Write back to global arrays | |
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { | |
// Load a vector of particles from uniform local array into varying registers | |
varying float updated_px = px_local[i + programIndex]; | |
varying float updated_py = py_local[i + programIndex]; | |
varying float updated_pz = pz_local[i + programIndex]; | |
// Store these varying values into the global memory | |
foreach_active (idx) { | |
p_pos_x_ptr[p_base + i + programIndex] = updated_px; | |
p_pos_y_ptr[p_base + i + programIndex] = updated_py; | |
p_pos_z_ptr[p_base + i + programIndex] = updated_pz; | |
} | |
} | |
} | |
// Step 4: Sphere Collisions with Friction | |
for (uniform int c = 0; c < MAX_CHAINS; c += 1) { | |
uniform int p_base = c * PARTICLES_PER_CHAIN; | |
uniform int s_base = c * SPHERES_PER_CHAIN; | |
uniform struct chain_cfg *cfg = &chain_configs_ptr[c]; | |
varying float friction_vec = cfg->friction; | |
for (uniform int s = 0; s < SPHERES_PER_CHAIN; s += 1) { | |
varying float sphere_x_vec = sphere_x_ptr[s_base + s]; | |
varying float sphere_y_vec = sphere_y_ptr[s_base + s]; | |
varying float sphere_z_vec = sphere_z_ptr[s_base + s]; | |
varying float sphere_r_vec = sphere_radius_ptr[s_base + s]; | |
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { | |
varying float p_curr_x = p_pos_x_ptr[p_base + i + programIndex]; | |
varying float p_curr_y = p_pos_y_ptr[p_base + i + programIndex]; | |
varying float p_curr_z = p_pos_z_ptr[p_base + i + programIndex]; | |
varying float p_prev_x = prev_p_pos_x_ptr[p_base + i + programIndex]; | |
varying float p_prev_y = prev_p_pos_y_ptr[p_base + i + programIndex]; | |
varying float p_prev_z = prev_p_pos_z_ptr[p_base + i + programIndex]; | |
varying float p_inv_mass = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); | |
varying float dx = p_curr_x - sphere_x_vec; | |
varying float dy = p_curr_y - sphere_y_vec; | |
varying float dz = p_curr_z - sphere_z_vec; | |
varying float dist_sq = dx * dx + dy * dy + dz * dz; | |
varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); | |
varying float dist = rcp(inv_dist); | |
varying float penetration = sphere_r_vec - dist; | |
varying float collision_mask = select(penetration > zero_vec, 1.0, 0.0); | |
varying float normal_x = dx * inv_dist; | |
varying float normal_y = dy * inv_dist; | |
varying float normal_z = dz * inv_dist; | |
varying float normal_corr_x = normal_x * penetration; | |
varying float normal_corr_y = normal_y * penetration; | |
varying float normal_corr_z = normal_z * penetration; | |
varying float v_x = p_curr_x - p_prev_x; | |
varying float v_y = p_curr_y - p_prev_y; | |
varying float v_z = p_curr_z - p_prev_z; | |
varying float vel_dot_normal = v_x * normal_x + v_y * normal_y + v_z * normal_z; | |
varying float tangent_x = v_x - vel_dot_normal * normal_x; | |
varying float tangent_y = v_y - vel_dot_normal * normal_y; | |
varying float tangent_z = v_z - vel_dot_normal * normal_z; | |
varying float tangent_mag_sq = tangent_x * tangent_x + tangent_y * tangent_y + tangent_z * tangent_z; | |
varying float inv_tangent_mag = rsqrt(max(tangent_mag_sq, eps_vec)); | |
varying float friction_mag = penetration * friction_vec; | |
varying float friction_x = (tangent_x * inv_tangent_mag) * friction_mag; | |
varying float friction_y = (tangent_y * inv_tangent_mag) * friction_mag; | |
varying float friction_z = (tangent_z * inv_tangent_mag) * friction_mag; | |
varying float total_corr_x = (normal_corr_x - friction_x) * collision_mask * p_inv_mass; | |
varying float total_corr_y = (normal_corr_y - friction_y) * collision_mask * p_inv_mass; | |
varying float total_corr_z = (normal_corr_z - friction_z) * collision_mask * p_inv_mass; | |
p_pos_x_ptr[p_base + i + programIndex] = p_curr_x + total_corr_x; | |
p_pos_y_ptr[p_base + i + programIndex] = p_curr_y + total_corr_y; | |
p_pos_z_ptr[p_base + i + programIndex] = p_curr_z + total_corr_z; | |
} | |
} | |
} | |
// Step 5: Motion Constraints | |
for (uniform int c = 0; c < MAX_CHAINS; c += 1) { | |
uniform int p_base = c * PARTICLES_PER_CHAIN; | |
for (uniform int i = 0; i < PARTICLES_PER_CHAIN; i += programCount) { | |
varying float px = p_pos_x_ptr[p_base + i + programIndex]; | |
varying float py = p_pos_y_ptr[p_base + i + programIndex]; | |
varying float pz = p_pos_z_ptr[p_base + i + programIndex]; | |
varying float pm = clamp(inv_mass_ptr[p_base + i + programIndex], inv_mass_clamp_min, inv_mass_clamp_max); | |
varying float rx = rest_pos_x_ptr[p_base + i + programIndex]; | |
varying float ry = rest_pos_y_ptr[p_base + i + programIndex]; | |
varying float rz = rest_pos_z_ptr[p_base + i + programIndex]; | |
varying float radius_vec = motion_radius_ptr[p_base + i + programIndex]; | |
varying float dx = px - rx; | |
varying float dy = py - ry; | |
varying float dz = pz - rz; | |
varying float dist_sq = dx * dx + dy * dy + dz * dz; | |
varying float inv_dist = rsqrt(max(dist_sq, eps_vec)); | |
varying float dist = rcp(inv_dist); | |
varying float penetration = dist - radius_vec; | |
varying float mask = select(penetration > zero_vec, 1.0, 0.0); | |
varying float corr_factor = penetration * inv_dist * mask; | |
varying float delta_x = dx * corr_factor * pm; | |
varying float delta_y = dy * corr_factor * pm; | |
varying float delta_z = dz * corr_factor * pm; | |
p_pos_x_ptr[p_base + i + programIndex] = px - delta_x; | |
p_pos_y_ptr[p_base + i + programIndex] = py - delta_y; | |
p_pos_z_ptr[p_base + i + programIndex] = pz - delta_z; | |
} | |
} | |
} |
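// Hypothetical C++-side invocation (illustrative only; the header | |
// name and buffer variables below are assumptions, not part of this | |
// file). ISPC places exported functions in namespace ispc: | |
//   #include "simulate_chains_ispc.h" | |
//   // all arrays 32-byte aligned, sized as in struct chain_sim | |
//   ispc::simulate_chains(chain_quat_x, chain_quat_y, chain_quat_z, | |
//       chain_quat_w, wind_x, wind_y, wind_z, gravity_x, gravity_y, | |
//       gravity_z, /* ...remaining buffers in declaration order... */, dt); | |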