WarpX
CurrentDeposition.H
Go to the documentation of this file.
1 /* Copyright 2019 Axel Huebl, David Grote, Maxence Thevenet
2  * Remi Lehe, Weiqun Zhang, Michael Rowan
3  *
4  * This file is part of WarpX.
5  *
6  * License: BSD-3-Clause-LBNL
7  */
8 #ifndef CURRENTDEPOSITION_H_
9 #define CURRENTDEPOSITION_H_
10 
14 #include "Particles/ShapeFactors.H"
15 #include "Utils/TextMsg.H"
17 #include "Utils/WarpXConst.H"
18 #ifdef WARPX_DIM_RZ
19 # include "Utils/WarpX_Complex.H"
20 #endif
21 
22 #include <AMReX.H>
23 #include <AMReX_Arena.H>
24 #include <AMReX_Array4.H>
25 #include <AMReX_REAL.H>
26 
27 using namespace amrex::literals;
28 
53 template <int depos_order>
54 void doDepositionShapeN (const GetParticlePosition& GetPosition,
55  const amrex::ParticleReal * const wp,
56  const amrex::ParticleReal * const uxp,
57  const amrex::ParticleReal * const uyp,
58  const amrex::ParticleReal * const uzp,
59  const int * const ion_lev,
60  amrex::FArrayBox& jx_fab,
61  amrex::FArrayBox& jy_fab,
62  amrex::FArrayBox& jz_fab,
63  const long np_to_depose,
64  const amrex::Real relative_time,
65  const std::array<amrex::Real,3>& dx,
66  const std::array<amrex::Real,3>& xyzmin,
67  const amrex::Dim3 lo,
68  const amrex::Real q,
69  const int n_rz_azimuthal_modes,
70  amrex::Real* cost,
71  const long load_balance_costs_update_algo)
72 {
73 #if !defined(WARPX_DIM_RZ)
74  amrex::ignore_unused(n_rz_azimuthal_modes);
75 #endif
76 
77 #if !defined(AMREX_USE_GPU)
78  amrex::ignore_unused(cost, load_balance_costs_update_algo);
79 #endif
80 
81  // Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
82  // (do_ionization=1)
83  const bool do_ionization = ion_lev;
84  const amrex::Real dzi = 1.0_rt/dx[2];
85 #if defined(WARPX_DIM_1D_Z)
86  const amrex::Real invvol = dzi;
87 #endif
88 #if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
89  const amrex::Real dxi = 1.0_rt/dx[0];
90  const amrex::Real invvol = dxi*dzi;
91 #elif defined(WARPX_DIM_3D)
92  const amrex::Real dxi = 1.0_rt/dx[0];
93  const amrex::Real dyi = 1.0_rt/dx[1];
94  const amrex::Real invvol = dxi*dyi*dzi;
95 #endif
96 
97 #if (AMREX_SPACEDIM >= 2)
98  const amrex::Real xmin = xyzmin[0];
99 #endif
100 #if defined(WARPX_DIM_3D)
101  const amrex::Real ymin = xyzmin[1];
102 #endif
103  const amrex::Real zmin = xyzmin[2];
104 
105  const amrex::Real clightsq = 1.0_rt/PhysConst::c/PhysConst::c;
106 
107  amrex::Array4<amrex::Real> const& jx_arr = jx_fab.array();
108  amrex::Array4<amrex::Real> const& jy_arr = jy_fab.array();
109  amrex::Array4<amrex::Real> const& jz_arr = jz_fab.array();
110  amrex::IntVect const jx_type = jx_fab.box().type();
111  amrex::IntVect const jy_type = jy_fab.box().type();
112  amrex::IntVect const jz_type = jz_fab.box().type();
113 
114  constexpr int zdir = WARPX_ZINDEX;
115  constexpr int NODE = amrex::IndexType::NODE;
116  constexpr int CELL = amrex::IndexType::CELL;
117 
118  // Loop over particles and deposit into jx_fab, jy_fab and jz_fab
119 #if defined(WARPX_USE_GPUCLOCK)
120  amrex::Real* cost_real = nullptr;
121  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
122  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
123  *cost_real = 0._rt;
124  }
125 #endif
127  np_to_depose,
128  [=] AMREX_GPU_DEVICE (long ip) {
129 #if defined(WARPX_USE_GPUCLOCK)
130  const auto KernelTimer = ablastr::parallelization::KernelTimer(
131  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
132  cost_real);
133 #endif
134 
135  // --- Get particle quantities
136  const amrex::Real gaminv = 1.0_rt/std::sqrt(1.0_rt + uxp[ip]*uxp[ip]*clightsq
137  + uyp[ip]*uyp[ip]*clightsq
138  + uzp[ip]*uzp[ip]*clightsq);
139  amrex::Real wq = q*wp[ip];
140  if (do_ionization){
141  wq *= ion_lev[ip];
142  }
143 
144  amrex::ParticleReal xp, yp, zp;
145  GetPosition(ip, xp, yp, zp);
146 
147  const amrex::Real vx = uxp[ip]*gaminv;
148  const amrex::Real vy = uyp[ip]*gaminv;
149  const amrex::Real vz = uzp[ip]*gaminv;
150  // wqx, wqy wqz are particle current in each direction
151 #if defined(WARPX_DIM_RZ)
152  // In RZ, wqx is actually wqr, and wqy is wqtheta
153  // Convert to cylinderical at the mid point
154  const amrex::Real xpmid = xp + relative_time*vx;
155  const amrex::Real ypmid = yp + relative_time*vy;
156  const amrex::Real rpmid = std::sqrt(xpmid*xpmid + ypmid*ypmid);
157  amrex::Real costheta;
158  amrex::Real sintheta;
159  if (rpmid > 0._rt) {
160  costheta = xpmid/rpmid;
161  sintheta = ypmid/rpmid;
162  } else {
163  costheta = 1._rt;
164  sintheta = 0._rt;
165  }
166  const Complex xy0 = Complex{costheta, sintheta};
167  const amrex::Real wqx = wq*invvol*(+vx*costheta + vy*sintheta);
168  const amrex::Real wqy = wq*invvol*(-vx*sintheta + vy*costheta);
169 #else
170  const amrex::Real wqx = wq*invvol*vx;
171  const amrex::Real wqy = wq*invvol*vy;
172 #endif
173  const amrex::Real wqz = wq*invvol*vz;
174 
175  // --- Compute shape factors
176  Compute_shape_factor< depos_order > const compute_shape_factor;
177 #if (AMREX_SPACEDIM >= 2)
178  // x direction
179  // Get particle position after 1/2 push back in position
180 #if defined(WARPX_DIM_RZ)
181  // Keep these double to avoid bug in single precision
182  const double xmid = (rpmid - xmin)*dxi;
183 #else
184  const double xmid = ((xp - xmin) + relative_time*vx)*dxi;
185 #endif
186  // j_j[xyz] leftmost grid point in x that the particle touches for the centering of each current
187  // sx_j[xyz] shape factor along x for the centering of each current
188  // There are only two possible centerings, node or cell centered, so at most only two shape factor
189  // arrays will be needed.
190  // Keep these double to avoid bug in single precision
191  double sx_node[depos_order + 1] = {0.};
192  double sx_cell[depos_order + 1] = {0.};
193  int j_node = 0;
194  int j_cell = 0;
195  if (jx_type[0] == NODE || jy_type[0] == NODE || jz_type[0] == NODE) {
196  j_node = compute_shape_factor(sx_node, xmid);
197  }
198  if (jx_type[0] == CELL || jy_type[0] == CELL || jz_type[0] == CELL) {
199  j_cell = compute_shape_factor(sx_cell, xmid - 0.5);
200  }
201 
202  amrex::Real sx_jx[depos_order + 1] = {0._rt};
203  amrex::Real sx_jy[depos_order + 1] = {0._rt};
204  amrex::Real sx_jz[depos_order + 1] = {0._rt};
205  for (int ix=0; ix<=depos_order; ix++)
206  {
207  sx_jx[ix] = ((jx_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
208  sx_jy[ix] = ((jy_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
209  sx_jz[ix] = ((jz_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
210  }
211 
212  int const j_jx = ((jx_type[0] == NODE) ? j_node : j_cell);
213  int const j_jy = ((jy_type[0] == NODE) ? j_node : j_cell);
214  int const j_jz = ((jz_type[0] == NODE) ? j_node : j_cell);
215 #endif //AMREX_SPACEDIM >= 2
216 
217 #if defined(WARPX_DIM_3D)
218  // y direction
219  // Keep these double to avoid bug in single precision
220  const double ymid = ((yp - ymin) + relative_time*vy)*dyi;
221  double sy_node[depos_order + 1] = {0.};
222  double sy_cell[depos_order + 1] = {0.};
223  int k_node = 0;
224  int k_cell = 0;
225  if (jx_type[1] == NODE || jy_type[1] == NODE || jz_type[1] == NODE) {
226  k_node = compute_shape_factor(sy_node, ymid);
227  }
228  if (jx_type[1] == CELL || jy_type[1] == CELL || jz_type[1] == CELL) {
229  k_cell = compute_shape_factor(sy_cell, ymid - 0.5);
230  }
231  amrex::Real sy_jx[depos_order + 1] = {0._rt};
232  amrex::Real sy_jy[depos_order + 1] = {0._rt};
233  amrex::Real sy_jz[depos_order + 1] = {0._rt};
234  for (int iy=0; iy<=depos_order; iy++)
235  {
236  sy_jx[iy] = ((jx_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
237  sy_jy[iy] = ((jy_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
238  sy_jz[iy] = ((jz_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
239  }
240  int const k_jx = ((jx_type[1] == NODE) ? k_node : k_cell);
241  int const k_jy = ((jy_type[1] == NODE) ? k_node : k_cell);
242  int const k_jz = ((jz_type[1] == NODE) ? k_node : k_cell);
243 #endif
244 
245  // z direction
246  // Keep these double to avoid bug in single precision
247  const double zmid = ((zp - zmin) + relative_time*vz)*dzi;
248  double sz_node[depos_order + 1] = {0.};
249  double sz_cell[depos_order + 1] = {0.};
250  int l_node = 0;
251  int l_cell = 0;
252  if (jx_type[zdir] == NODE || jy_type[zdir] == NODE || jz_type[zdir] == NODE) {
253  l_node = compute_shape_factor(sz_node, zmid);
254  }
255  if (jx_type[zdir] == CELL || jy_type[zdir] == CELL || jz_type[zdir] == CELL) {
256  l_cell = compute_shape_factor(sz_cell, zmid - 0.5);
257  }
258  amrex::Real sz_jx[depos_order + 1] = {0._rt};
259  amrex::Real sz_jy[depos_order + 1] = {0._rt};
260  amrex::Real sz_jz[depos_order + 1] = {0._rt};
261  for (int iz=0; iz<=depos_order; iz++)
262  {
263  sz_jx[iz] = ((jx_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
264  sz_jy[iz] = ((jy_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
265  sz_jz[iz] = ((jz_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
266  }
267  int const l_jx = ((jx_type[zdir] == NODE) ? l_node : l_cell);
268  int const l_jy = ((jy_type[zdir] == NODE) ? l_node : l_cell);
269  int const l_jz = ((jz_type[zdir] == NODE) ? l_node : l_cell);
270 
271  // Deposit current into jx_arr, jy_arr and jz_arr
272 #if defined(WARPX_DIM_1D_Z)
273  for (int iz=0; iz<=depos_order; iz++){
275  &jx_arr(lo.x+l_jx+iz, 0, 0, 0),
276  sz_jx[iz]*wqx);
278  &jy_arr(lo.x+l_jy+iz, 0, 0, 0),
279  sz_jy[iz]*wqy);
281  &jz_arr(lo.x+l_jz+iz, 0, 0, 0),
282  sz_jz[iz]*wqz);
283  }
284 #endif
285 #if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
286  for (int iz=0; iz<=depos_order; iz++){
287  for (int ix=0; ix<=depos_order; ix++){
289  &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 0),
290  sx_jx[ix]*sz_jx[iz]*wqx);
292  &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 0),
293  sx_jy[ix]*sz_jy[iz]*wqy);
295  &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 0),
296  sx_jz[ix]*sz_jz[iz]*wqz);
297 #if defined(WARPX_DIM_RZ)
298  Complex xy = xy0; // Note that xy is equal to e^{i m theta}
299  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
300  // The factor 2 on the weighting comes from the normalization of the modes
301  amrex::Gpu::Atomic::AddNoRet( &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 2*imode-1), 2._rt*sx_jx[ix]*sz_jx[iz]*wqx*xy.real());
302  amrex::Gpu::Atomic::AddNoRet( &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 2*imode ), 2._rt*sx_jx[ix]*sz_jx[iz]*wqx*xy.imag());
303  amrex::Gpu::Atomic::AddNoRet( &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode-1), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.real());
304  amrex::Gpu::Atomic::AddNoRet( &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode ), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.imag());
305  amrex::Gpu::Atomic::AddNoRet( &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode-1), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.real());
306  amrex::Gpu::Atomic::AddNoRet( &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode ), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.imag());
307  xy = xy*xy0;
308  }
309 #endif
310  }
311  }
312 #elif defined(WARPX_DIM_3D)
313  for (int iz=0; iz<=depos_order; iz++){
314  for (int iy=0; iy<=depos_order; iy++){
315  for (int ix=0; ix<=depos_order; ix++){
317  &jx_arr(lo.x+j_jx+ix, lo.y+k_jx+iy, lo.z+l_jx+iz),
318  sx_jx[ix]*sy_jx[iy]*sz_jx[iz]*wqx);
320  &jy_arr(lo.x+j_jy+ix, lo.y+k_jy+iy, lo.z+l_jy+iz),
321  sx_jy[ix]*sy_jy[iy]*sz_jy[iz]*wqy);
323  &jz_arr(lo.x+j_jz+ix, lo.y+k_jz+iy, lo.z+l_jz+iz),
324  sx_jz[ix]*sy_jz[iy]*sz_jz[iz]*wqz);
325  }
326  }
327  }
328 #endif
329  }
330  );
331 #if defined(WARPX_USE_GPUCLOCK)
332  if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
334  *cost += *cost_real;
335  amrex::The_Managed_Arena()->free(cost_real);
336  }
337 #endif
338 }
339 
365 template <int depos_order>
367  const amrex::ParticleReal * const wp,
368  const amrex::ParticleReal * const uxp,
369  const amrex::ParticleReal * const uyp,
370  const amrex::ParticleReal * const uzp,
371  const int * const ion_lev,
372  amrex::FArrayBox& jx_fab,
373  amrex::FArrayBox& jy_fab,
374  amrex::FArrayBox& jz_fab,
375  const long np_to_depose,
376  const amrex::Real relative_time,
377  const std::array<amrex::Real,3>& dx,
378  const std::array<amrex::Real,3>& xyzmin,
379  const amrex::Dim3 lo,
380  const amrex::Real q,
381  const int n_rz_azimuthal_modes,
382  amrex::Real* cost,
383  const long load_balance_costs_update_algo,
385  const amrex::Box& box,
386  const amrex::Geometry& geom,
387  const amrex::IntVect& a_tbox_max_size)
388 {
389 #if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA)
390  using namespace amrex;
391 
392  auto permutation = a_bins.permutationPtr();
393 
394  amrex::Array4<amrex::Real> const& jx_arr = jx_fab.array();
395  amrex::Array4<amrex::Real> const& jy_arr = jy_fab.array();
396  amrex::Array4<amrex::Real> const& jz_arr = jz_fab.array();
397  amrex::IntVect const jx_type = jx_fab.box().type();
398  amrex::IntVect const jy_type = jy_fab.box().type();
399  amrex::IntVect const jz_type = jz_fab.box().type();
400 
401  constexpr int zdir = WARPX_ZINDEX;
402  constexpr int NODE = amrex::IndexType::NODE;
403  constexpr int CELL = amrex::IndexType::CELL;
404 
405  // Loop over particles and deposit into jx_fab, jy_fab and jz_fab
406 #if defined(WARPX_USE_GPUCLOCK)
407  amrex::Real* cost_real = nullptr;
408  if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
409  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
410  *cost_real = 0._rt;
411  }
412 #endif
413  const auto dxiarr = geom.InvCellSizeArray();
414  const auto plo = geom.ProbLoArray();
415  const auto domain = geom.Domain();
416 
417  amrex::Box sample_tbox(IntVect(AMREX_D_DECL(0,0,0)), a_tbox_max_size - 1);
418  sample_tbox.grow(depos_order);
419 
420  amrex::Box sample_tbox_x = convert(sample_tbox, jx_type);
421  amrex::Box sample_tbox_y = convert(sample_tbox, jy_type);
422  amrex::Box sample_tbox_z = convert(sample_tbox, jz_type);
423 
424  const auto npts = amrex::max(sample_tbox_x.numPts(), sample_tbox_y.numPts(), sample_tbox_z.numPts());
425 
426  const int nblocks = a_bins.numBins();
427  const int threads_per_block = WarpX::shared_mem_current_tpb;
428  const auto offsets_ptr = a_bins.offsetsPtr();
429 
430  const std::size_t shared_mem_bytes = npts*sizeof(amrex::Real);
431  const amrex::IntVect bin_size = WarpX::shared_tilesize;
432  const std::size_t max_shared_mem_bytes = amrex::Gpu::Device::sharedMemPerBlock();
433  WARPX_ALWAYS_ASSERT_WITH_MESSAGE(shared_mem_bytes <= max_shared_mem_bytes,
434  "Tile size too big for GPU shared memory current deposition");
435 
436  amrex::ignore_unused(np_to_depose);
437  // Launch one thread-block per bin
439  nblocks, threads_per_block, shared_mem_bytes, amrex::Gpu::gpuStream(),
440  [=] AMREX_GPU_DEVICE () noexcept {
441 #if defined(WARPX_USE_GPUCLOCK)
442  const auto KernelTimer = ablastr::parallelization::KernelTimer(
443  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
444  cost_real);
445 #endif
446  const int bin_id = blockIdx.x;
447  const unsigned int bin_start = offsets_ptr[bin_id];
448  const unsigned int bin_stop = offsets_ptr[bin_id+1];
449 
450  if (bin_start == bin_stop) { return; /*this bin has no particles*/ }
451 
452  // These boxes define the index space for the shared memory buffers
453  amrex::Box buffer_box;
454  {
455  ParticleReal xp, yp, zp;
456  GetPosition(permutation[bin_start], xp, yp, zp);
457 #if defined(WARPX_DIM_3D)
458  IntVect iv = IntVect(int( amrex::Math::floor((xp-plo[0]) * dxiarr[0]) ),
459  int( amrex::Math::floor((yp-plo[1]) * dxiarr[1]) ),
460  int( amrex::Math::floor((zp-plo[2]) * dxiarr[2]) ));
461 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
462  IntVect iv = IntVect(int( amrex::Math::floor((xp-plo[0]) * dxiarr[0]) ),
463  int( amrex::Math::floor((zp-plo[1]) * dxiarr[1]) ));
464 #elif defined(WARPX_DIM_1D_Z)
465  IntVect iv = IntVect(int( amrex::Math::floor((zp-plo[0]) * dxiarr[0]) ));
466 #endif
467  iv += domain.smallEnd();
468  getTileIndex(iv, box, true, bin_size, buffer_box);
469  }
470 
471  buffer_box.grow(depos_order);
472  Box tbox_x = convert(buffer_box, jx_type);
473  Box tbox_y = convert(buffer_box, jy_type);
474  Box tbox_z = convert(buffer_box, jz_type);
475 
477  amrex::Real* const shared = gsm.dataPtr();
478 
479  amrex::Array4<amrex::Real> const jx_buff(shared,
480  amrex::begin(tbox_x), amrex::end(tbox_x), 1);
481  amrex::Array4<amrex::Real> const jy_buff(shared,
482  amrex::begin(tbox_y), amrex::end(tbox_y), 1);
483  amrex::Array4<amrex::Real> const jz_buff(shared,
484  amrex::begin(tbox_z), amrex::end(tbox_z), 1);
485 
486  // Zero-initialize the temporary array in shared memory
487  volatile amrex::Real* vs = shared;
488  for (int i = threadIdx.x; i < npts; i += blockDim.x){
489  vs[i] = 0.0;
490  }
491  __syncthreads();
492  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
493  {
494  const unsigned int ip = permutation[ip_orig];
495  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jx_buff, jx_type,
496  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
497  ip, zdir, NODE, CELL, 0);
498  }
499 
500  __syncthreads();
501  addLocalToGlobal(tbox_x, jx_arr, jx_buff);
502  for (int i = threadIdx.x; i < npts; i += blockDim.x){
503  vs[i] = 0.0;
504  }
505 
506  __syncthreads();
507  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
508  {
509  const unsigned int ip = permutation[ip_orig];
510  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jy_buff, jy_type,
511  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
512  ip, zdir, NODE, CELL, 1);
513  }
514 
515  __syncthreads();
516  addLocalToGlobal(tbox_y, jy_arr, jy_buff);
517  for (int i = threadIdx.x; i < npts; i += blockDim.x){
518  vs[i] = 0.0;
519  }
520 
521  __syncthreads();
522  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
523  {
524  const unsigned int ip = permutation[ip_orig];
525  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jz_buff, jz_type,
526  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
527  ip, zdir, NODE, CELL, 2);
528  }
529 
530  __syncthreads();
531  addLocalToGlobal(tbox_z, jz_arr, jz_buff);
532  });
533 #if defined(WARPX_USE_GPUCLOCK)
534  if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
536  *cost += *cost_real;
537  amrex::The_Managed_Arena()->free(cost_real);
538  }
539 #endif
540 #else // not using hip/cuda
541  // Note, you should never reach this part of the code. This funcion cannot be called unless
542  // using HIP/CUDA, and those things are checked prior
543  //don't use any args
544  ignore_unused( GetPosition, wp, uxp, uyp, uzp, ion_lev, jx_fab, jy_fab, jz_fab, np_to_depose, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes, cost, load_balance_costs_update_algo, a_bins, box, geom, a_tbox_max_size);
545  WARPX_ABORT_WITH_MESSAGE("Shared memory only implemented for HIP/CUDA");
546 #endif
547 }
548 
575 template <int depos_order>
577  const amrex::ParticleReal * const wp,
578  const amrex::ParticleReal * const uxp,
579  const amrex::ParticleReal * const uyp,
580  const amrex::ParticleReal * const uzp,
581  const int * const ion_lev,
582  const amrex::Array4<amrex::Real>& Jx_arr,
583  const amrex::Array4<amrex::Real>& Jy_arr,
584  const amrex::Array4<amrex::Real>& Jz_arr,
585  const long np_to_depose,
586  const amrex::Real dt,
587  const amrex::Real relative_time,
588  const std::array<amrex::Real,3>& dx,
589  const std::array<amrex::Real, 3> xyzmin,
590  const amrex::Dim3 lo,
591  const amrex::Real q,
592  const int n_rz_azimuthal_modes,
593  amrex::Real * const cost,
594  const long load_balance_costs_update_algo)
595 {
596  using namespace amrex;
597 #if !defined(WARPX_DIM_RZ)
598  ignore_unused(n_rz_azimuthal_modes);
599 #endif
600 
601 #if !defined(AMREX_USE_GPU)
602  amrex::ignore_unused(cost, load_balance_costs_update_algo);
603 #endif
604 
605  // Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
606  // (do_ionization=1)
607  bool const do_ionization = ion_lev;
608 #if !defined(WARPX_DIM_1D_Z)
609  Real const dxi = 1.0_rt / dx[0];
610 #endif
611 #if !defined(WARPX_DIM_1D_Z)
612  Real const xmin = xyzmin[0];
613 #endif
614 #if defined(WARPX_DIM_3D)
615  Real const dyi = 1.0_rt / dx[1];
616  Real const ymin = xyzmin[1];
617 #endif
618  Real const dzi = 1.0_rt / dx[2];
619  Real const zmin = xyzmin[2];
620 
621 #if defined(WARPX_DIM_3D)
622  Real const invdtdx = 1.0_rt / (dt*dx[1]*dx[2]);
623  Real const invdtdy = 1.0_rt / (dt*dx[0]*dx[2]);
624  Real const invdtdz = 1.0_rt / (dt*dx[0]*dx[1]);
625 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
626  Real const invdtdx = 1.0_rt / (dt*dx[2]);
627  Real const invdtdz = 1.0_rt / (dt*dx[0]);
628  Real const invvol = 1.0_rt / (dx[0]*dx[2]);
629 #elif defined(WARPX_DIM_1D_Z)
630  Real const invdtdz = 1.0_rt / (dt*dx[0]);
631  Real const invvol = 1.0_rt / (dx[2]);
632 #endif
633 
634 #if defined(WARPX_DIM_RZ)
635  Complex const I = Complex{0._rt, 1._rt};
636 #endif
637 
638  Real const clightsq = 1.0_rt / ( PhysConst::c * PhysConst::c );
639 #if !defined(WARPX_DIM_1D_Z)
640  Real constexpr one_third = 1.0_rt / 3.0_rt;
641  Real constexpr one_sixth = 1.0_rt / 6.0_rt;
642 #endif
643 
644  // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr
645 #if defined(WARPX_USE_GPUCLOCK)
646  amrex::Real* cost_real = nullptr;
647  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
648  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
649  *cost_real = 0._rt;
650  }
651 #endif
653  np_to_depose,
654  [=] AMREX_GPU_DEVICE (long const ip) {
655 #if defined(WARPX_USE_GPUCLOCK)
656  const auto KernelTimer = ablastr::parallelization::KernelTimer(
657  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
658  cost_real);
659 #endif
660 
661  // --- Get particle quantities
662  Real const gaminv = 1.0_rt/std::sqrt(1.0_rt + uxp[ip]*uxp[ip]*clightsq
663  + uyp[ip]*uyp[ip]*clightsq
664  + uzp[ip]*uzp[ip]*clightsq);
665 
666  // wqx, wqy wqz are particle current in each direction
667  Real wq = q*wp[ip];
668  if (do_ionization){
669  wq *= ion_lev[ip];
670  }
671 
672  ParticleReal xp, yp, zp;
673  GetPosition(ip, xp, yp, zp);
674 
675 #if !defined(WARPX_DIM_1D_Z)
676  Real const wqx = wq*invdtdx;
677 #endif
678 #if defined(WARPX_DIM_3D)
679  Real const wqy = wq*invdtdy;
680 #endif
681  Real const wqz = wq*invdtdz;
682 
683  // computes current and old position in grid units
684 #if defined(WARPX_DIM_RZ)
685  Real const xp_new = xp + (relative_time + 0.5_rt*dt)*uxp[ip]*gaminv;
686  Real const yp_new = yp + (relative_time + 0.5_rt*dt)*uyp[ip]*gaminv;
687  Real const xp_mid = xp_new - 0.5_rt*dt*uxp[ip]*gaminv;
688  Real const yp_mid = yp_new - 0.5_rt*dt*uyp[ip]*gaminv;
689  Real const xp_old = xp_new - dt*uxp[ip]*gaminv;
690  Real const yp_old = yp_new - dt*uyp[ip]*gaminv;
691  Real const rp_new = std::sqrt(xp_new*xp_new + yp_new*yp_new);
692  Real const rp_mid = std::sqrt(xp_mid*xp_mid + yp_mid*yp_mid);
693  Real const rp_old = std::sqrt(xp_old*xp_old + yp_old*yp_old);
694  Real costheta_new, sintheta_new;
695  if (rp_new > 0._rt) {
696  costheta_new = xp_new/rp_new;
697  sintheta_new = yp_new/rp_new;
698  } else {
699  costheta_new = 1._rt;
700  sintheta_new = 0._rt;
701  }
702  amrex::Real costheta_mid, sintheta_mid;
703  if (rp_mid > 0._rt) {
704  costheta_mid = xp_mid/rp_mid;
705  sintheta_mid = yp_mid/rp_mid;
706  } else {
707  costheta_mid = 1._rt;
708  sintheta_mid = 0._rt;
709  }
710  amrex::Real costheta_old, sintheta_old;
711  if (rp_old > 0._rt) {
712  costheta_old = xp_old/rp_old;
713  sintheta_old = yp_old/rp_old;
714  } else {
715  costheta_old = 1._rt;
716  sintheta_old = 0._rt;
717  }
718  const Complex xy_new0 = Complex{costheta_new, sintheta_new};
719  const Complex xy_mid0 = Complex{costheta_mid, sintheta_mid};
720  const Complex xy_old0 = Complex{costheta_old, sintheta_old};
721  // Keep these double to avoid bug in single precision
722  double const x_new = (rp_new - xmin)*dxi;
723  double const x_old = (rp_old - xmin)*dxi;
724 #else
725 #if !defined(WARPX_DIM_1D_Z)
726  // Keep these double to avoid bug in single precision
727  double const x_new = (xp - xmin + (relative_time + 0.5_rt*dt)*uxp[ip]*gaminv)*dxi;
728  double const x_old = x_new - dt*dxi*uxp[ip]*gaminv;
729 #endif
730 #endif
731 #if defined(WARPX_DIM_3D)
732  // Keep these double to avoid bug in single precision
733  double const y_new = (yp - ymin + (relative_time + 0.5_rt*dt)*uyp[ip]*gaminv)*dyi;
734  double const y_old = y_new - dt*dyi*uyp[ip]*gaminv;
735 #endif
736  // Keep these double to avoid bug in single precision
737  double const z_new = (zp - zmin + (relative_time + 0.5_rt*dt)*uzp[ip]*gaminv)*dzi;
738  double const z_old = z_new - dt*dzi*uzp[ip]*gaminv;
739 
740 #if defined(WARPX_DIM_RZ)
741  Real const vy = (-uxp[ip]*sintheta_mid + uyp[ip]*costheta_mid)*gaminv;
742 #elif defined(WARPX_DIM_XZ)
743  Real const vy = uyp[ip]*gaminv;
744 #elif defined(WARPX_DIM_1D_Z)
745  Real const vx = uxp[ip]*gaminv;
746  Real const vy = uyp[ip]*gaminv;
747 #endif
748 
749  // Shape factor arrays
750  // Note that there are extra values above and below
751  // to possibly hold the factor for the old particle
752  // which can be at a different grid location.
753  // Keep these double to avoid bug in single precision
754 #if !defined(WARPX_DIM_1D_Z)
755  double sx_new[depos_order + 3] = {0.};
756  double sx_old[depos_order + 3] = {0.};
757 #endif
758 #if defined(WARPX_DIM_3D)
759  // Keep these double to avoid bug in single precision
760  double sy_new[depos_order + 3] = {0.};
761  double sy_old[depos_order + 3] = {0.};
762 #endif
763  // Keep these double to avoid bug in single precision
764  double sz_new[depos_order + 3] = {0.};
765  double sz_old[depos_order + 3] = {0.};
766 
767  // --- Compute shape factors
768  // Compute shape factors for position as they are now and at old positions
769  // [ijk]_new: leftmost grid point that the particle touches
770  const Compute_shape_factor< depos_order > compute_shape_factor;
771  const Compute_shifted_shape_factor< depos_order > compute_shifted_shape_factor;
772 
773 #if !defined(WARPX_DIM_1D_Z)
774  const int i_new = compute_shape_factor(sx_new+1, x_new);
775  const int i_old = compute_shifted_shape_factor(sx_old, x_old, i_new);
776 #endif
777 #if defined(WARPX_DIM_3D)
778  const int j_new = compute_shape_factor(sy_new+1, y_new);
779  const int j_old = compute_shifted_shape_factor(sy_old, y_old, j_new);
780 #endif
781  const int k_new = compute_shape_factor(sz_new+1, z_new);
782  const int k_old = compute_shifted_shape_factor(sz_old, z_old, k_new);
783 
784  // computes min/max positions of current contributions
785 #if !defined(WARPX_DIM_1D_Z)
786  int dil = 1, diu = 1;
787  if (i_old < i_new) dil = 0;
788  if (i_old > i_new) diu = 0;
789 #endif
790 #if defined(WARPX_DIM_3D)
791  int djl = 1, dju = 1;
792  if (j_old < j_new) djl = 0;
793  if (j_old > j_new) dju = 0;
794 #endif
795  int dkl = 1, dku = 1;
796  if (k_old < k_new) dkl = 0;
797  if (k_old > k_new) dku = 0;
798 
799 #if defined(WARPX_DIM_3D)
800 
801  for (int k=dkl; k<=depos_order+2-dku; k++) {
802  for (int j=djl; j<=depos_order+2-dju; j++) {
803  amrex::Real sdxi = 0._rt;
804  for (int i=dil; i<=depos_order+1-diu; i++) {
805  sdxi += wqx*(sx_old[i] - sx_new[i])*(
806  one_third*(sy_new[j]*sz_new[k] + sy_old[j]*sz_old[k])
807  +one_sixth*(sy_new[j]*sz_old[k] + sy_old[j]*sz_new[k]));
808  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdxi);
809  }
810  }
811  }
812  for (int k=dkl; k<=depos_order+2-dku; k++) {
813  for (int i=dil; i<=depos_order+2-diu; i++) {
814  amrex::Real sdyj = 0._rt;
815  for (int j=djl; j<=depos_order+1-dju; j++) {
816  sdyj += wqy*(sy_old[j] - sy_new[j])*(
817  one_third*(sx_new[i]*sz_new[k] + sx_old[i]*sz_old[k])
818  +one_sixth*(sx_new[i]*sz_old[k] + sx_old[i]*sz_new[k]));
819  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdyj);
820  }
821  }
822  }
823  for (int j=djl; j<=depos_order+2-dju; j++) {
824  for (int i=dil; i<=depos_order+2-diu; i++) {
825  amrex::Real sdzk = 0._rt;
826  for (int k=dkl; k<=depos_order+1-dku; k++) {
827  sdzk += wqz*(sz_old[k] - sz_new[k])*(
828  one_third*(sx_new[i]*sy_new[j] + sx_old[i]*sy_old[j])
829  +one_sixth*(sx_new[i]*sy_old[j] + sx_old[i]*sy_new[j]));
830  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdzk);
831  }
832  }
833  }
834 
835 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
836 
837  for (int k=dkl; k<=depos_order+2-dku; k++) {
838  amrex::Real sdxi = 0._rt;
839  for (int i=dil; i<=depos_order+1-diu; i++) {
840  sdxi += wqx*(sx_old[i] - sx_new[i])*0.5_rt*(sz_new[k] + sz_old[k]);
841  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdxi);
842 #if defined(WARPX_DIM_RZ)
843  Complex xy_mid = xy_mid0; // Throughout the following loop, xy_mid takes the value e^{i m theta}
844  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
845  // The factor 2 comes from the normalization of the modes
846  const Complex djr_cmplx = 2._rt *sdxi*xy_mid;
847  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djr_cmplx.real());
848  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djr_cmplx.imag());
849  xy_mid = xy_mid*xy_mid0;
850  }
851 #endif
852  }
853  }
854  for (int k=dkl; k<=depos_order+2-dku; k++) {
855  for (int i=dil; i<=depos_order+2-diu; i++) {
856  Real const sdyj = wq*vy*invvol*(
857  one_third*(sx_new[i]*sz_new[k] + sx_old[i]*sz_old[k])
858  +one_sixth*(sx_new[i]*sz_old[k] + sx_old[i]*sz_new[k]));
859  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdyj);
860 #if defined(WARPX_DIM_RZ)
861  Complex xy_new = xy_new0;
862  Complex xy_mid = xy_mid0;
863  Complex xy_old = xy_old0;
864  // Throughout the following loop, xy_ takes the value e^{i m theta_}
865  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
866  // The factor 2 comes from the normalization of the modes
867  // The minus sign comes from the different convention with respect to Davidson et al.
868  const Complex djt_cmplx = -2._rt * I*(i_new-1 + i + xmin*dxi)*wq*invdtdx/(amrex::Real)imode
869  *(Complex(sx_new[i]*sz_new[k], 0._rt)*(xy_new - xy_mid)
870  + Complex(sx_old[i]*sz_old[k], 0._rt)*(xy_mid - xy_old));
871  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djt_cmplx.real());
872  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djt_cmplx.imag());
873  xy_new = xy_new*xy_new0;
874  xy_mid = xy_mid*xy_mid0;
875  xy_old = xy_old*xy_old0;
876  }
877 #endif
878  }
879  }
880  for (int i=dil; i<=depos_order+2-diu; i++) {
881  Real sdzk = 0._rt;
882  for (int k=dkl; k<=depos_order+1-dku; k++) {
883  sdzk += wqz*(sz_old[k] - sz_new[k])*0.5_rt*(sx_new[i] + sx_old[i]);
884  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdzk);
885 #if defined(WARPX_DIM_RZ)
886  Complex xy_mid = xy_mid0; // Throughout the following loop, xy_mid takes the value e^{i m theta}
887  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
888  // The factor 2 comes from the normalization of the modes
889  const Complex djz_cmplx = 2._rt * sdzk * xy_mid;
890  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djz_cmplx.real());
891  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djz_cmplx.imag());
892  xy_mid = xy_mid*xy_mid0;
893  }
894 #endif
895  }
896  }
897 #elif defined(WARPX_DIM_1D_Z)
898 
899  for (int k=dkl; k<=depos_order+2-dku; k++) {
900  amrex::Real const sdxi = wq*vx*invvol*0.5_rt*(sz_old[k] + sz_new[k]);
901  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+k_new-1+k, 0, 0, 0), sdxi);
902  }
903  for (int k=dkl; k<=depos_order+2-dku; k++) {
904  amrex::Real const sdyj = wq*vy*invvol*0.5_rt*(sz_old[k] + sz_new[k]);
905  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+k_new-1+k, 0, 0, 0), sdyj);
906  }
907  amrex::Real sdzk = 0._rt;
908  for (int k=dkl; k<=depos_order+1-dku; k++) {
909  sdzk += wqz*(sz_old[k] - sz_new[k]);
910  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+k_new-1+k, 0, 0, 0), sdzk);
911  }
912 #endif
913  }
914  );
915 #if defined(WARPX_USE_GPUCLOCK)
916  if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
918  *cost += *cost_real;
919  amrex::The_Managed_Arena()->free(cost_real);
920  }
921 #endif
922 }
923 
953 template <int depos_order>
955  const amrex::ParticleReal* const wp,
956  const amrex::ParticleReal* const uxp,
957  const amrex::ParticleReal* const uyp,
958  const amrex::ParticleReal* const uzp,
959  const int* const ion_lev,
960  amrex::FArrayBox& Dx_fab,
961  amrex::FArrayBox& Dy_fab,
962  amrex::FArrayBox& Dz_fab,
963  const long np_to_depose,
964  const amrex::Real dt,
965  const amrex::Real relative_time,
966  const std::array<amrex::Real,3>& dx,
967  const std::array<amrex::Real,3>& xyzmin,
968  const amrex::Dim3 lo,
969  const amrex::Real q,
970  const int n_rz_azimuthal_modes,
971  amrex::Real* cost,
972  const long load_balance_costs_update_algo)
973 {
974 #if defined(WARPX_DIM_RZ)
975  amrex::ignore_unused(GetPosition,
976  wp, uxp, uyp, uzp, ion_lev, Dx_fab, Dy_fab, Dz_fab,
977  np_to_depose, dt, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes);
978  WARPX_ABORT_WITH_MESSAGE("Vay deposition not implemented in RZ geometry");
979 #endif
980 
981 #if defined(WARPX_DIM_1D_Z)
982  amrex::ignore_unused(GetPosition,
983  wp, uxp, uyp, uzp, ion_lev, Dx_fab, Dy_fab, Dz_fab,
984  np_to_depose, dt, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes);
985  WARPX_ABORT_WITH_MESSAGE("Vay deposition not implemented in cartesian 1D geometry");
986 #endif
987 
988 #if !defined(WARPX_USE_GPUCLOCK)
989  amrex::ignore_unused(cost, load_balance_costs_update_algo);
990 #endif
991 
992 #if !(defined WARPX_DIM_RZ || defined WARPX_DIM_1D_Z)
993  amrex::ignore_unused(n_rz_azimuthal_modes);
994 
995  // If ion_lev is a null pointer, then do_ionization=0, else do_ionization=1
996  const bool do_ionization = ion_lev;
997 
998  // Inverse cell volume in each direction
999  const amrex::Real dxi = 1._rt / dx[0];
1000  const amrex::Real dzi = 1._rt / dx[2];
1001 #if defined(WARPX_DIM_3D)
1002  const amrex::Real dyi = 1._rt / dx[1];
1003 #endif
1004 
1005  // Inverse of time step
1006  const amrex::Real invdt = 1._rt / dt;
1007 
1008  // Total inverse cell volume
1009 #if defined(WARPX_DIM_XZ)
1010  const amrex::Real invvol = dxi * dzi;
1011 #elif defined(WARPX_DIM_3D)
1012  const amrex::Real invvol = dxi * dyi * dzi;
1013 #endif
1014 
1015  // Lower bound of physical domain in each direction
1016  const amrex::Real xmin = xyzmin[0];
1017  const amrex::Real zmin = xyzmin[2];
1018 #if defined(WARPX_DIM_3D)
1019  const amrex::Real ymin = xyzmin[1];
1020 #endif
1021 
1022  // Allocate temporary arrays
1023 #if defined(WARPX_DIM_3D)
1024  AMREX_ALWAYS_ASSERT(Dx_fab.box() == Dy_fab.box() && Dx_fab.box() == Dz_fab.box());
1025  amrex::FArrayBox temp_fab{Dx_fab.box(), 4};
1026 #elif defined(WARPX_DIM_XZ)
1027  AMREX_ALWAYS_ASSERT(Dx_fab.box() == Dz_fab.box());
1028  amrex::FArrayBox temp_fab{Dx_fab.box(), 2};
1029 #endif
1030  temp_fab.setVal<amrex::RunOn::Device>(0._rt);
1031  amrex::Array4<amrex::Real> const& temp_arr = temp_fab.array();
1032 
1033  // Inverse of light speed squared
1034  const amrex::Real invcsq = 1._rt / (PhysConst::c * PhysConst::c);
1035 
1036  // Arrays where D will be stored
1037  amrex::Array4<amrex::Real> const& Dx_arr = Dx_fab.array();
1038  amrex::Array4<amrex::Real> const& Dy_arr = Dy_fab.array();
1039  amrex::Array4<amrex::Real> const& Dz_arr = Dz_fab.array();
1040 
1041  // Loop over particles and deposit (Dx,Dy,Dz) into Dx_fab, Dy_fab and Dz_fab
1042 #if defined(WARPX_USE_GPUCLOCK)
1043  amrex::Real* cost_real = nullptr;
1044  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
1045  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
1046  *cost_real = 0._rt;
1047  }
1048 #endif
1049  amrex::ParallelFor(np_to_depose, [=] AMREX_GPU_DEVICE (long ip)
1050  {
1051 #if defined(WARPX_USE_GPUCLOCK)
1052  const auto KernelTimer = ablastr::parallelization::KernelTimer(
1053  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
1054  cost_real);
1055 #endif
1056 
1057  // Inverse of Lorentz factor gamma
1058  const amrex::Real invgam = 1._rt / std::sqrt(1._rt + uxp[ip] * uxp[ip] * invcsq
1059  + uyp[ip] * uyp[ip] * invcsq
1060  + uzp[ip] * uzp[ip] * invcsq);
1061  // Product of particle charges and weights
1062  amrex::Real wq = q * wp[ip];
1063  if (do_ionization) wq *= ion_lev[ip];
1064 
1065  // Current particle positions (in physical units)
1066  amrex::ParticleReal xp, yp, zp;
1067  GetPosition(ip, xp, yp, zp);
1068 
1069  // Particle velocities
1070  const amrex::Real vx = uxp[ip] * invgam;
1071  const amrex::Real vy = uyp[ip] * invgam;
1072  const amrex::Real vz = uzp[ip] * invgam;
1073 
1074  // Modify the particle position to match the time of the deposition
1075  xp += relative_time * vx;
1076  yp += relative_time * vy;
1077  zp += relative_time * vz;
1078 
1079  // Particle current densities
1080 #if defined(WARPX_DIM_XZ)
1081  const amrex::Real wqy = wq * vy * invvol;
1082 #endif
1083 
1084  // Current and old particle positions in grid units
1085  // Keep these double to avoid bug in single precision.
1086  double const x_new = (xp - xmin + 0.5_rt*dt*vx) * dxi;
1087  double const x_old = (xp - xmin - 0.5_rt*dt*vx) * dxi;
1088 #if defined(WARPX_DIM_3D)
1089  // Keep these double to avoid bug in single precision.
1090  double const y_new = (yp - ymin + 0.5_rt*dt*vy) * dyi;
1091  double const y_old = (yp - ymin - 0.5_rt*dt*vy) * dyi;
1092 #endif
1093  // Keep these double to avoid bug in single precision.
1094  double const z_new = (zp - zmin + 0.5_rt*dt*vz) * dzi;
1095  double const z_old = (zp - zmin - 0.5_rt*dt*vz) * dzi;
1096 
1097  // Shape factor arrays for current and old positions (nodal)
1098  // Keep these double to avoid bug in single precision.
1099  double sx_new[depos_order+1] = {0.};
1100  double sx_old[depos_order+1] = {0.};
1101 #if defined(WARPX_DIM_3D)
1102  // Keep these double to avoid bug in single precision.
1103  double sy_new[depos_order+1] = {0.};
1104  double sy_old[depos_order+1] = {0.};
1105 #endif
1106  // Keep these double to avoid bug in single precision.
1107  double sz_new[depos_order+1] = {0.};
1108  double sz_old[depos_order+1] = {0.};
1109 
1110  // Compute shape factors for current positions
1111 
1112  // i_new leftmost grid point in x that the particle touches
1113  // sx_new shape factor along x for the centering of each current
1114  Compute_shape_factor< depos_order > const compute_shape_factor;
1115  const int i_new = compute_shape_factor(sx_new, x_new);
1116 #if defined(WARPX_DIM_3D)
1117  // j_new leftmost grid point in y that the particle touches
1118  // sy_new shape factor along y for the centering of each current
1119  const int j_new = compute_shape_factor(sy_new, y_new);
1120 #endif
1121  // k_new leftmost grid point in z that the particle touches
1122  // sz_new shape factor along z for the centering of each current
1123  const int k_new = compute_shape_factor(sz_new, z_new);
1124 
1125  // Compute shape factors for old positions
1126 
1127  // i_old leftmost grid point in x that the particle touches
1128  // sx_old shape factor along x for the centering of each current
1129  const int i_old = compute_shape_factor(sx_old, x_old);
1130 #if defined(WARPX_DIM_3D)
1131  // j_old leftmost grid point in y that the particle touches
1132  // sy_old shape factor along y for the centering of each current
1133  const int j_old = compute_shape_factor(sy_old, y_old);
1134 #endif
1135  // k_old leftmost grid point in z that the particle touches
1136  // sz_old shape factor along z for the centering of each current
1137  const int k_old = compute_shape_factor(sz_old, z_old);
1138 
1139  // Deposit current into Dx_arr, Dy_arr and Dz_arr
1140 #if defined(WARPX_DIM_XZ)
1141 
1142  for (int k=0; k<=depos_order; k++) {
1143  for (int i=0; i<=depos_order; i++) {
1144 
1145  // Re-casting sx_new and sz_new from double to amrex::Real so that
1146  // Atomic::Add has consistent types in its argument
1147  auto const sxn_szn = static_cast<amrex::Real>(sx_new[i] * sz_new[k]);
1148  auto const sxo_szn = static_cast<amrex::Real>(sx_old[i] * sz_new[k]);
1149  auto const sxn_szo = static_cast<amrex::Real>(sx_new[i] * sz_old[k]);
1150  auto const sxo_szo = static_cast<amrex::Real>(sx_old[i] * sz_old[k]);
1151 
1152  if (i_new == i_old && k_new == k_old) {
1153  // temp arrays for Dx and Dz
1154  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1155  wq * invvol * invdt * (sxn_szn - sxo_szo));
1156 
1157  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 1),
1158  wq * invvol * invdt * (sxn_szo - sxo_szn));
1159 
1160  // Dy
1161  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1162  wqy * 0.25_rt * (sxn_szn + sxn_szo + sxo_szn + sxo_szo));
1163  } else {
1164  // temp arrays for Dx and Dz
1165  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1166  wq * invvol * invdt * sxn_szn);
1167 
1168  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + k_old + k, 0, 0),
1169  - wq * invvol * invdt * sxo_szo);
1170 
1171  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_old + k, 0, 1),
1172  wq * invvol * invdt * sxn_szo);
1173 
1174  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + k_new + k, 0, 1),
1175  - wq * invvol * invdt * sxo_szn);
1176 
1177  // Dy
1178  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1179  wqy * 0.25_rt * sxn_szn);
1180 
1181  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_old + k, 0, 0),
1182  wqy * 0.25_rt * sxn_szo);
1183 
1184  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_old + i, lo.y + k_new + k, 0, 0),
1185  wqy * 0.25_rt * sxo_szn);
1186 
1187  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_old + i, lo.y + k_old + k, 0, 0),
1188  wqy * 0.25_rt * sxo_szo);
1189  }
1190 
1191  }
1192  }
1193 
1194 #elif defined(WARPX_DIM_3D)
1195 
1196  for (int k=0; k<=depos_order; k++) {
1197  for (int j=0; j<=depos_order; j++) {
1198 
1199  auto const syn_szn = static_cast<amrex::Real>(sy_new[j] * sz_new[k]);
1200  auto const syo_szn = static_cast<amrex::Real>(sy_old[j] * sz_new[k]);
1201  auto const syn_szo = static_cast<amrex::Real>(sy_new[j] * sz_old[k]);
1202  auto const syo_szo = static_cast<amrex::Real>(sy_old[j] * sz_old[k]);
1203 
1204  for (int i=0; i<=depos_order; i++) {
1205 
1206  auto const sxn_syn_szn = static_cast<amrex::Real>(sx_new[i]) * syn_szn;
1207  auto const sxo_syn_szn = static_cast<amrex::Real>(sx_old[i]) * syn_szn;
1208  auto const sxn_syo_szn = static_cast<amrex::Real>(sx_new[i]) * syo_szn;
1209  auto const sxo_syo_szn = static_cast<amrex::Real>(sx_old[i]) * syo_szn;
1210  auto const sxn_syn_szo = static_cast<amrex::Real>(sx_new[i]) * syn_szo;
1211  auto const sxo_syn_szo = static_cast<amrex::Real>(sx_old[i]) * syn_szo;
1212  auto const sxn_syo_szo = static_cast<amrex::Real>(sx_new[i]) * syo_szo;
1213  auto const sxo_syo_szo = static_cast<amrex::Real>(sx_old[i]) * syo_szo;
1214 
1215  if (i_new == i_old && j_new == j_old && k_new == k_old) {
1216  // temp arrays for Dx, Dy and Dz
1217  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 0),
1218  wq * invvol * invdt * (sxn_syn_szn - sxo_syo_szo));
1219 
1220  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 1),
1221  wq * invvol * invdt * (sxn_syn_szo - sxo_syo_szn));
1222 
1223  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 2),
1224  wq * invvol * invdt * (sxn_syo_szn - sxo_syn_szo));
1225 
1226  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 3),
1227  wq * invvol * invdt * (sxo_syn_szn - sxn_syo_szo));
1228  } else {
1229  // temp arrays for Dx, Dy and Dz
1230  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 0),
1231  wq * invvol * invdt * sxn_syn_szn);
1232 
1233  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_old + j, lo.z + k_old + k, 0),
1234  - wq * invvol * invdt * sxo_syo_szo);
1235 
1236  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_old + k, 1),
1237  wq * invvol * invdt * sxn_syn_szo);
1238 
1239  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_old + j, lo.z + k_new + k, 1),
1240  - wq * invvol * invdt * sxo_syo_szn);
1241 
1242  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_old + j, lo.z + k_new + k, 2),
1243  wq * invvol * invdt * sxn_syo_szn);
1244 
1245  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_new + j, lo.z + k_old + k, 2),
1246  - wq * invvol * invdt * sxo_syn_szo);
1247 
1248  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_new + j, lo.z + k_new + k, 3),
1249  wq * invvol * invdt * sxo_syn_szn);
1250 
1251  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_old + j, lo.z + k_old + k, 3),
1252  - wq * invvol * invdt * sxn_syo_szo);
1253  }
1254  }
1255  }
1256  }
1257 #endif
1258  } );
1259 
1260 #if defined(WARPX_DIM_3D)
1261  amrex::ParallelFor(Dx_fab.box(), [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
1262  {
1263  const amrex::Real t_a = temp_arr(i,j,k,0);
1264  const amrex::Real t_b = temp_arr(i,j,k,1);
1265  const amrex::Real t_c = temp_arr(i,j,k,2);
1266  const amrex::Real t_d = temp_arr(i,j,k,3);
1267  Dx_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a + t_b + t_c - 2._rt*t_d);
1268  Dy_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a + t_b - 2._rt*t_c + t_d);
1269  Dz_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a - 2._rt*t_b + t_c + t_d);
1270  });
1271 #elif defined(WARPX_DIM_XZ)
1272  amrex::ParallelFor(Dx_fab.box(), [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept
1273  {
1274  const amrex::Real t_a = temp_arr(i,j,0,0);
1275  const amrex::Real t_b = temp_arr(i,j,0,1);
1276  Dx_arr(i,j,0) += (0.5_rt)*(t_a + t_b);
1277  Dz_arr(i,j,0) += (0.5_rt)*(t_a - t_b);
1278  });
1279 #endif
1280  // Synchronize so that temp_fab can be safely deallocated in its destructor
1282 
1283 
1284 # if defined(WARPX_USE_GPUCLOCK)
1285  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
1287  *cost += *cost_real;
1288  amrex::The_Managed_Arena()->free(cost_real);
1289  }
1290 # endif
1291 #endif // #if !(defined WARPX_DIM_RZ || defined WARPX_DIM_1D_Z)
1292 }
1293 #endif // CURRENTDEPOSITION_H_
#define AMREX_ALWAYS_ASSERT(EX)
#define AMREX_GPU_DEVICE
#define AMREX_D_DECL(a, b, c)
void doDepositionShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, amrex::FArrayBox &jx_fab, amrex::FArrayBox &jy_fab, amrex::FArrayBox &jz_fab, const long np_to_depose, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *cost, const long load_balance_costs_update_algo)
Current Deposition for thread thread_num.
Definition: CurrentDeposition.H:54
void doEsirkepovDepositionShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, const amrex::Array4< amrex::Real > &Jx_arr, const amrex::Array4< amrex::Real > &Jy_arr, const amrex::Array4< amrex::Real > &Jz_arr, const long np_to_depose, const amrex::Real dt, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *const cost, const long load_balance_costs_update_algo)
Esirkepov Current Deposition for thread thread_num.
Definition: CurrentDeposition.H:576
void doVayDepositionShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, amrex::FArrayBox &Dx_fab, amrex::FArrayBox &Dy_fab, amrex::FArrayBox &Dz_fab, const long np_to_depose, const amrex::Real dt, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *cost, const long load_balance_costs_update_algo)
Vay current deposition (Vay et al, 2013) for thread thread_num: deposit D in real space and store the...
Definition: CurrentDeposition.H:954
void doDepositionSharedShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, amrex::FArrayBox &jx_fab, amrex::FArrayBox &jy_fab, amrex::FArrayBox &jz_fab, const long np_to_depose, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *cost, const long load_balance_costs_update_algo, const amrex::DenseBins< WarpXParticleContainer::ParticleType > &a_bins, const amrex::Box &box, const amrex::Geometry &geom, const amrex::IntVect &a_tbox_max_size)
Current Deposition for thread thread_num using shared memory.
Definition: CurrentDeposition.H:366
#define WARPX_ABORT_WITH_MESSAGE(MSG)
Definition: TextMsg.H:15
#define WARPX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition: TextMsg.H:13
amrex::GpuComplex< amrex::Real > Complex
Definition: WarpX_Complex.H:22
NODE
static amrex::IntVect shared_tilesize
tileSize to use for shared current deposition operations
Definition: WarpX.H:251
static int shared_mem_current_tpb
number of threads to use per block in shared deposition
Definition: WarpX.H:248
Defines a timer object to be used on GPU; measures summed thread cycles.
Definition: KernelTimer.H:27
virtual void free(void *pt)=0
virtual void * alloc(std::size_t sz)=0
AMREX_FORCE_INLINE Array4< Real const > array() const noexcept
const Box & box() const noexcept
AMREX_GPU_HOST_DEVICE Long numPts() const noexcept
AMREX_GPU_HOST_DEVICE Box & grow(int i) noexcept
GpuArray< Real, AMREX_SPACEDIM > InvCellSizeArray() const noexcept
index_type * offsetsPtr() noexcept
index_type * permutationPtr() noexcept
Long numBins() const noexcept
GpuArray< Real, AMREX_SPACEDIM > ProbLoArray() const noexcept
const Box & Domain() const noexcept
static std::size_t sharedMemPerBlock() noexcept
static constexpr auto c
vacuum speed of light [m/s]
Definition: constant.H:44
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
void streamSynchronize() noexcept
gpuStream_t gpuStream() noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 end(Box const &box) noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box convert(const Box &b, const IntVect &typ) noexcept
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE const T & max(const T &a, const T &b) noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 begin(Box const &box) noexcept
void launch(T const &n, L &&f) noexcept
Arena * The_Managed_Arena()
std::enable_if_t< std::is_integral< T >::value > ParallelFor(TypeList< CTOs... >, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int getTileIndex(const IntVect &iv, const Box &box, const bool a_do_tiling, const IntVect &a_tile_size, Box &tbx)
i
Definition: check_interp_points_and_weights.py:174
float dt
Definition: stencil.py:440
tuple dx
lab frame
Definition: stencil.py:427
xmin
Definition: stencil.py:422
Definition: ShapeFactors.H:29
Definition: ShapeFactors.H:84
Functor that can be used to extract the positions of the macroparticles inside a ParallelFor kernel.
Definition: GetAndSetPosition.H:53
@ GpuClock
Definition: WarpXAlgorithmSelection.H:125
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE T real() const noexcept
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE T imag() const noexcept
AMREX_GPU_DEVICE T * dataPtr() noexcept