Skip to content

Commit 9a09b6b

Browse files
committed
lj speed up
1 parent 2f49485 commit 9a09b6b

2 files changed

Lines changed: 34 additions & 32 deletions

File tree

src/force/lj.cu

Lines changed: 31 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,8 @@ The class dealing with the Lennard-Jones (LJ) pairwise potentials.
2626
// best block size here: 128
2727
#define BLOCK_SIZE_FORCE 128
2828

29+
#define USE_FIXED_NEIGHBOR
30+
2931
LJ::LJ(FILE* fid, int num_types, int num_atoms)
3032
{
3133
printf("Use %d-element LJ potential with elements:\n", num_types);
@@ -69,11 +71,11 @@ LJ::~LJ(void)
6971

7072
// get U_ij and (d U_ij / d r_ij) / r_ij (the LJ potential)
7173
static __device__ void
72-
find_p2_and_f2(double s6e4, double s12e4, double d12sq, double& p2, double& f2)
74+
find_p2_and_f2(float s6e4, float s12e4, float d12sq, float& p2, float& f2)
7375
{
74-
double d12inv2 = 1.0 / d12sq;
75-
double d12inv6 = d12inv2 * d12inv2 * d12inv2;
76-
f2 = 6.0 * (s6e4 * d12inv6 - s12e4 * 2.0 * d12inv6 * d12inv6) * d12inv2;
76+
float d12inv2 = 1.0f / d12sq;
77+
float d12inv6 = d12inv2 * d12inv2 * d12inv2;
78+
f2 = 6.0f * (s6e4 * d12inv6 - s12e4 * 2.0f * d12inv6 * d12inv6) * d12inv2;
7779
p2 = s12e4 * d12inv6 * d12inv6 - s6e4 * d12inv6;
7880
}
7981

@@ -97,19 +99,19 @@ static __global__ void gpu_find_force(
9799
double* g_potential)
98100
{
99101
int n1 = blockIdx.x * blockDim.x + threadIdx.x + N1; // particle index
100-
double s_fx = 0.0; // force_x
101-
double s_fy = 0.0; // force_y
102-
double s_fz = 0.0; // force_z
103-
double s_pe = 0.0; // potential energy
104-
double s_sxx = 0.0; // virial_stress_xx
105-
double s_sxy = 0.0; // virial_stress_xy
106-
double s_sxz = 0.0; // virial_stress_xz
107-
double s_syx = 0.0; // virial_stress_yx
108-
double s_syy = 0.0; // virial_stress_yy
109-
double s_syz = 0.0; // virial_stress_yz
110-
double s_szx = 0.0; // virial_stress_zx
111-
double s_szy = 0.0; // virial_stress_zy
112-
double s_szz = 0.0; // virial_stress_zz
102+
float s_fx = 0.0f; // force_x
103+
float s_fy = 0.0f; // force_y
104+
float s_fz = 0.0f; // force_z
105+
float s_pe = 0.0f; // potential energy
106+
float s_sxx = 0.0f; // virial_stress_xx
107+
float s_sxy = 0.0f; // virial_stress_xy
108+
float s_sxz = 0.0f; // virial_stress_xz
109+
float s_syx = 0.0f; // virial_stress_yx
110+
float s_syy = 0.0f; // virial_stress_yy
111+
float s_syz = 0.0f; // virial_stress_yz
112+
float s_szx = 0.0f; // virial_stress_zx
113+
float s_szy = 0.0f; // virial_stress_zy
114+
float s_szz = 0.0f; // virial_stress_zz
113115

114116
if (n1 < N2) {
115117
int neighbor_number = g_neighbor_number[n1];
@@ -122,33 +124,33 @@ static __global__ void gpu_find_force(
122124
int n2 = g_neighbor_list[n1 + number_of_particles * i1];
123125
int type2 = g_type[n2];
124126

125-
double x12 = g_x[n2] - x1;
126-
double y12 = g_y[n2] - y1;
127-
double z12 = g_z[n2] - z1;
127+
float x12 = g_x[n2] - x1;
128+
float y12 = g_y[n2] - y1;
129+
float z12 = g_z[n2] - z1;
128130
apply_mic(box, x12, y12, z12);
129-
double d12sq = x12 * x12 + y12 * y12 + z12 * z12;
131+
float d12sq = x12 * x12 + y12 * y12 + z12 * z12;
130132

131-
double p2, f2;
133+
float p2, f2;
132134
if (d12sq >= lj.cutoff_square[type1][type2]) {
133135
continue;
134136
}
135137
find_p2_and_f2(lj.s6e4[type1][type2], lj.s12e4[type1][type2], d12sq, p2, f2);
136138

137139
// treat two-body potential in the same way as many-body potential
138-
double f12x = f2 * x12 * 0.5;
139-
double f12y = f2 * y12 * 0.5;
140-
double f12z = f2 * z12 * 0.5;
141-
double f21x = -f12x;
142-
double f21y = -f12y;
143-
double f21z = -f12z;
140+
float f12x = f2 * x12 * 0.5f;
141+
float f12y = f2 * y12 * 0.5f;
142+
float f12z = f2 * z12 * 0.5f;
143+
float f21x = -f12x;
144+
float f21y = -f12y;
145+
float f21z = -f12z;
144146

145147
// accumulate force
146148
s_fx += f12x - f21x;
147149
s_fy += f12y - f21y;
148150
s_fz += f12z - f21z;
149151

150152
// accumulate potential energy and virial
151-
s_pe += p2 * 0.5; // two-body potential
153+
s_pe += p2 * 0.5f; // two-body potential
152154
s_sxx += x12 * f21x;
153155
s_sxy += x12 * f21y;
154156
s_sxz += x12 * f21z;

src/force/lj.cuh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,9 +22,9 @@
2222
#define MAX_TYPE 10 // == max number of potentials
2323

2424
struct LJ_Para {
25-
double s6e4[MAX_TYPE][MAX_TYPE];
26-
double s12e4[MAX_TYPE][MAX_TYPE];
27-
double cutoff_square[MAX_TYPE][MAX_TYPE];
25+
float s6e4[MAX_TYPE][MAX_TYPE];
26+
float s12e4[MAX_TYPE][MAX_TYPE];
27+
float cutoff_square[MAX_TYPE][MAX_TYPE];
2828
};
2929

3030
struct LJ_Data {

0 commit comments

Comments
 (0)