Created
April 26, 2020 16:53
-
-
Save castano/8fff1eed89eb1bb49a132c92f5e62879 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Wide8 besterror8 = broadcast8(FLT_MAX); | |
Vector3_Wide8 beststart8; | |
Vector3_Wide8 bestend8; | |
// check all possible clusters for this total order | |
for (int i = 0; i < s_fourClusterTotal[count - 1]; i += 8) | |
{ | |
//uint c0 = s_fourCluster[i].c0; | |
//uint c1 = s_fourCluster[i].c1; | |
//uint c2 = s_fourCluster[i].c2; | |
// Load 4 uint8 per lane. | |
//__m256i packedClusterIndex = _mm256_load_si256((__m256i *)&s_fourCluster[i]); | |
Vector3_Wide8 x0;// = x_sat[c0]; | |
Wide8 w0;// = w_sat[c0]; | |
Vector3_Wide8 x1;// = x_sat[c1]; | |
Wide8 w1;// = w_sat[c1]; | |
Vector3_Wide8 x2;// = x_sat[c2]; | |
Wide8 w2;// = w_sat[c2]; | |
// @@ Is there a better way to do this? | |
for (int l = 0; l < 8; l++) { | |
uint c0 = s_fourCluster[i+l].c0; | |
uint c1 = s_fourCluster[i+l].c1; | |
uint c2 = s_fourCluster[i+l].c2; | |
x0.x.e[l] = x_sat[c0].x; | |
x0.y.e[l] = x_sat[c0].y; | |
x0.z.e[l] = x_sat[c0].z; | |
w0.e[l] = w_sat[c0]; | |
x1.x.e[l] = x_sat[c1].x; | |
x1.y.e[l] = x_sat[c1].y; | |
x1.z.e[l] = x_sat[c1].z; | |
w1.e[l] = w_sat[c1]; | |
x2.x.e[l] = x_sat[c2].x; | |
x2.y.e[l] = x_sat[c2].y; | |
x2.z.e[l] = x_sat[c2].z; | |
w2.e[l] = w_sat[c2]; | |
} | |
x2 = x2 - x1; | |
x1 = x1 - x0; | |
w2 = w2 - w1; | |
w1 = w1 - w0; | |
Wide8 w3 = broadcast8(m_wsum) - w0 - w1 - w2; | |
Wide8 alpha2_sum = mad8(w2, broadcast8(1.0f / 9.0f), mad8(w1, broadcast8(4.0f/ 9.0f), w0)); | |
Wide8 beta2_sum = mad8(w1, broadcast8(1.0f / 9.0f), mad8(w2, broadcast8(4.0f / 9.0f), w3)); | |
Wide8 alphabeta_sum = (w1 + w2) * broadcast8(2.0f / 9.0f); | |
Wide8 factor = rcp8(alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); | |
Vector3_Wide8 alphax_sum = mad8(x2, broadcast8(1.0f / 3.0f), mad8(x1, broadcast8(2.0f / 3.0f), x0)); | |
Vector3_Wide8 betax_sum = broadcast8(m_xsum) - alphax_sum; | |
Vector3_Wide8 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; | |
Vector3_Wide8 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; | |
// clamp to the grid | |
a = saturate8(a); | |
b = saturate8(b); | |
a = round_ept8(a); | |
b = round_ept8(b); | |
// compute the error @@ Use fma here. | |
Vector3_Wide8 e1 = a * a * alpha2_sum + b * b * beta2_sum + (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum) * broadcast8(2.0f); | |
// apply the metric to the error term | |
//Wide8 error = dot8(e1, broadcast8(m_metricSqr)); | |
Wide8 error = e1.x + e1.y + e1.z; | |
// keep the solution if it wins | |
auto mask = ge8(besterror8, error); | |
besterror8 = select8(mask, besterror8, error); // @@ Use min? | |
beststart8 = select8(mask, beststart8, a); | |
bestend8 = select8(mask, bestend8, b); | |
} | |
// Is there a better way to do this reduction? | |
int bestindex; | |
for (int i = 0; i < 8; i++) { | |
if (besterror8.e[i]< besterror) { | |
besterror = besterror8.e[i]; | |
bestindex = i; | |
} | |
} | |
beststart.x = beststart8.x.e[bestindex]; | |
beststart.y = beststart8.y.e[bestindex]; | |
beststart.z = beststart8.z.e[bestindex]; | |
bestend.x = bestend8.x.e[bestindex]; | |
bestend.y = bestend8.y.e[bestindex]; | |
bestend.z = bestend8.z.e[bestindex]; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment