WarpX
CurrentDeposition.H
Go to the documentation of this file.
1 /* Copyright 2019 Axel Huebl, David Grote, Maxence Thevenet
2  * Remi Lehe, Weiqun Zhang, Michael Rowan
3  *
4  * This file is part of WarpX.
5  *
6  * License: BSD-3-Clause-LBNL
7  */
8 #ifndef CURRENTDEPOSITION_H_
9 #define CURRENTDEPOSITION_H_
10 
14 #include "Particles/ShapeFactors.H"
15 #include "Utils/TextMsg.H"
17 #include "Utils/WarpXConst.H"
18 #ifdef WARPX_DIM_RZ
19 # include "Utils/WarpX_Complex.H"
20 #endif
21 
22 #include "WarpX.H" // todo: remove include and pass globals as args
23 
24 #include <AMReX.H>
25 #include <AMReX_Arena.H>
26 #include <AMReX_Array4.H>
27 #include <AMReX_REAL.H>
28 
53 template <int depos_order>
55  const amrex::ParticleReal * const wp,
56  const amrex::ParticleReal * const uxp,
57  const amrex::ParticleReal * const uyp,
58  const amrex::ParticleReal * const uzp,
59  const int* ion_lev,
60  amrex::FArrayBox& jx_fab,
61  amrex::FArrayBox& jy_fab,
62  amrex::FArrayBox& jz_fab,
63  long np_to_depose,
64  amrex::Real relative_time,
65  const std::array<amrex::Real,3>& dx,
66  const std::array<amrex::Real,3>& xyzmin,
67  amrex::Dim3 lo,
68  amrex::Real q,
69  int n_rz_azimuthal_modes,
70  amrex::Real* cost,
71  long load_balance_costs_update_algo)
72 {
73  using namespace amrex::literals;
74 
75 #if !defined(WARPX_DIM_RZ)
76  amrex::ignore_unused(n_rz_azimuthal_modes);
77 #endif
78 
79 #if !defined(AMREX_USE_GPU)
80  amrex::ignore_unused(cost, load_balance_costs_update_algo);
81 #endif
82 
83  // Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
84  // (do_ionization=1)
85  const bool do_ionization = ion_lev;
86  const amrex::Real dzi = 1.0_rt/dx[2];
87 #if defined(WARPX_DIM_1D_Z)
88  const amrex::Real invvol = dzi;
89 #endif
90 #if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
91  const amrex::Real dxi = 1.0_rt/dx[0];
92  const amrex::Real invvol = dxi*dzi;
93 #elif defined(WARPX_DIM_3D)
94  const amrex::Real dxi = 1.0_rt/dx[0];
95  const amrex::Real dyi = 1.0_rt/dx[1];
96  const amrex::Real invvol = dxi*dyi*dzi;
97 #endif
98 
99 #if (AMREX_SPACEDIM >= 2)
100  const amrex::Real xmin = xyzmin[0];
101 #endif
102 #if defined(WARPX_DIM_3D)
103  const amrex::Real ymin = xyzmin[1];
104 #endif
105  const amrex::Real zmin = xyzmin[2];
106 
107  const amrex::Real clightsq = 1.0_rt/PhysConst::c/PhysConst::c;
108 
109  amrex::Array4<amrex::Real> const& jx_arr = jx_fab.array();
110  amrex::Array4<amrex::Real> const& jy_arr = jy_fab.array();
111  amrex::Array4<amrex::Real> const& jz_arr = jz_fab.array();
112  amrex::IntVect const jx_type = jx_fab.box().type();
113  amrex::IntVect const jy_type = jy_fab.box().type();
114  amrex::IntVect const jz_type = jz_fab.box().type();
115 
116  constexpr int zdir = WARPX_ZINDEX;
117  constexpr int NODE = amrex::IndexType::NODE;
118  constexpr int CELL = amrex::IndexType::CELL;
119 
120  // Loop over particles and deposit into jx_fab, jy_fab and jz_fab
121 #if defined(WARPX_USE_GPUCLOCK)
122  amrex::Real* cost_real = nullptr;
123  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
124  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
125  *cost_real = 0._rt;
126  }
127 #endif
129  np_to_depose,
130  [=] AMREX_GPU_DEVICE (long ip) {
131 #if defined(WARPX_USE_GPUCLOCK)
132  const auto KernelTimer = ablastr::parallelization::KernelTimer(
133  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
134  cost_real);
135 #endif
136 
137  // --- Get particle quantities
138  const amrex::Real gaminv = 1.0_rt/std::sqrt(1.0_rt + uxp[ip]*uxp[ip]*clightsq
139  + uyp[ip]*uyp[ip]*clightsq
140  + uzp[ip]*uzp[ip]*clightsq);
141  amrex::Real wq = q*wp[ip];
142  if (do_ionization){
143  wq *= ion_lev[ip];
144  }
145 
146  amrex::ParticleReal xp, yp, zp;
147  GetPosition(ip, xp, yp, zp);
148 
149  const amrex::Real vx = uxp[ip]*gaminv;
150  const amrex::Real vy = uyp[ip]*gaminv;
151  const amrex::Real vz = uzp[ip]*gaminv;
152  // wqx, wqy wqz are particle current in each direction
153 #if defined(WARPX_DIM_RZ)
154  // In RZ, wqx is actually wqr, and wqy is wqtheta
155  // Convert to cylinderical at the mid point
156  const amrex::Real xpmid = xp + relative_time*vx;
157  const amrex::Real ypmid = yp + relative_time*vy;
158  const amrex::Real rpmid = std::sqrt(xpmid*xpmid + ypmid*ypmid);
159  amrex::Real costheta;
160  amrex::Real sintheta;
161  if (rpmid > 0._rt) {
162  costheta = xpmid/rpmid;
163  sintheta = ypmid/rpmid;
164  } else {
165  costheta = 1._rt;
166  sintheta = 0._rt;
167  }
168  const Complex xy0 = Complex{costheta, sintheta};
169  const amrex::Real wqx = wq*invvol*(+vx*costheta + vy*sintheta);
170  const amrex::Real wqy = wq*invvol*(-vx*sintheta + vy*costheta);
171 #else
172  const amrex::Real wqx = wq*invvol*vx;
173  const amrex::Real wqy = wq*invvol*vy;
174 #endif
175  const amrex::Real wqz = wq*invvol*vz;
176 
177  // --- Compute shape factors
178  Compute_shape_factor< depos_order > const compute_shape_factor;
179 #if (AMREX_SPACEDIM >= 2)
180  // x direction
181  // Get particle position after 1/2 push back in position
182 #if defined(WARPX_DIM_RZ)
183  // Keep these double to avoid bug in single precision
184  const double xmid = (rpmid - xmin)*dxi;
185 #else
186  const double xmid = ((xp - xmin) + relative_time*vx)*dxi;
187 #endif
188  // j_j[xyz] leftmost grid point in x that the particle touches for the centering of each current
189  // sx_j[xyz] shape factor along x for the centering of each current
190  // There are only two possible centerings, node or cell centered, so at most only two shape factor
191  // arrays will be needed.
192  // Keep these double to avoid bug in single precision
193  double sx_node[depos_order + 1] = {0.};
194  double sx_cell[depos_order + 1] = {0.};
195  int j_node = 0;
196  int j_cell = 0;
197  if (jx_type[0] == NODE || jy_type[0] == NODE || jz_type[0] == NODE) {
198  j_node = compute_shape_factor(sx_node, xmid);
199  }
200  if (jx_type[0] == CELL || jy_type[0] == CELL || jz_type[0] == CELL) {
201  j_cell = compute_shape_factor(sx_cell, xmid - 0.5);
202  }
203 
204  amrex::Real sx_jx[depos_order + 1] = {0._rt};
205  amrex::Real sx_jy[depos_order + 1] = {0._rt};
206  amrex::Real sx_jz[depos_order + 1] = {0._rt};
207  for (int ix=0; ix<=depos_order; ix++)
208  {
209  sx_jx[ix] = ((jx_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
210  sx_jy[ix] = ((jy_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
211  sx_jz[ix] = ((jz_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
212  }
213 
214  int const j_jx = ((jx_type[0] == NODE) ? j_node : j_cell);
215  int const j_jy = ((jy_type[0] == NODE) ? j_node : j_cell);
216  int const j_jz = ((jz_type[0] == NODE) ? j_node : j_cell);
217 #endif //AMREX_SPACEDIM >= 2
218 
219 #if defined(WARPX_DIM_3D)
220  // y direction
221  // Keep these double to avoid bug in single precision
222  const double ymid = ((yp - ymin) + relative_time*vy)*dyi;
223  double sy_node[depos_order + 1] = {0.};
224  double sy_cell[depos_order + 1] = {0.};
225  int k_node = 0;
226  int k_cell = 0;
227  if (jx_type[1] == NODE || jy_type[1] == NODE || jz_type[1] == NODE) {
228  k_node = compute_shape_factor(sy_node, ymid);
229  }
230  if (jx_type[1] == CELL || jy_type[1] == CELL || jz_type[1] == CELL) {
231  k_cell = compute_shape_factor(sy_cell, ymid - 0.5);
232  }
233  amrex::Real sy_jx[depos_order + 1] = {0._rt};
234  amrex::Real sy_jy[depos_order + 1] = {0._rt};
235  amrex::Real sy_jz[depos_order + 1] = {0._rt};
236  for (int iy=0; iy<=depos_order; iy++)
237  {
238  sy_jx[iy] = ((jx_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
239  sy_jy[iy] = ((jy_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
240  sy_jz[iy] = ((jz_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
241  }
242  int const k_jx = ((jx_type[1] == NODE) ? k_node : k_cell);
243  int const k_jy = ((jy_type[1] == NODE) ? k_node : k_cell);
244  int const k_jz = ((jz_type[1] == NODE) ? k_node : k_cell);
245 #endif
246 
247  // z direction
248  // Keep these double to avoid bug in single precision
249  const double zmid = ((zp - zmin) + relative_time*vz)*dzi;
250  double sz_node[depos_order + 1] = {0.};
251  double sz_cell[depos_order + 1] = {0.};
252  int l_node = 0;
253  int l_cell = 0;
254  if (jx_type[zdir] == NODE || jy_type[zdir] == NODE || jz_type[zdir] == NODE) {
255  l_node = compute_shape_factor(sz_node, zmid);
256  }
257  if (jx_type[zdir] == CELL || jy_type[zdir] == CELL || jz_type[zdir] == CELL) {
258  l_cell = compute_shape_factor(sz_cell, zmid - 0.5);
259  }
260  amrex::Real sz_jx[depos_order + 1] = {0._rt};
261  amrex::Real sz_jy[depos_order + 1] = {0._rt};
262  amrex::Real sz_jz[depos_order + 1] = {0._rt};
263  for (int iz=0; iz<=depos_order; iz++)
264  {
265  sz_jx[iz] = ((jx_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
266  sz_jy[iz] = ((jy_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
267  sz_jz[iz] = ((jz_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
268  }
269  int const l_jx = ((jx_type[zdir] == NODE) ? l_node : l_cell);
270  int const l_jy = ((jy_type[zdir] == NODE) ? l_node : l_cell);
271  int const l_jz = ((jz_type[zdir] == NODE) ? l_node : l_cell);
272 
273  // Deposit current into jx_arr, jy_arr and jz_arr
274 #if defined(WARPX_DIM_1D_Z)
275  for (int iz=0; iz<=depos_order; iz++){
277  &jx_arr(lo.x+l_jx+iz, 0, 0, 0),
278  sz_jx[iz]*wqx);
280  &jy_arr(lo.x+l_jy+iz, 0, 0, 0),
281  sz_jy[iz]*wqy);
283  &jz_arr(lo.x+l_jz+iz, 0, 0, 0),
284  sz_jz[iz]*wqz);
285  }
286 #endif
287 #if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
288  for (int iz=0; iz<=depos_order; iz++){
289  for (int ix=0; ix<=depos_order; ix++){
291  &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 0),
292  sx_jx[ix]*sz_jx[iz]*wqx);
294  &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 0),
295  sx_jy[ix]*sz_jy[iz]*wqy);
297  &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 0),
298  sx_jz[ix]*sz_jz[iz]*wqz);
299 #if defined(WARPX_DIM_RZ)
300  Complex xy = xy0; // Note that xy is equal to e^{i m theta}
301  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
302  // The factor 2 on the weighting comes from the normalization of the modes
303  amrex::Gpu::Atomic::AddNoRet( &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 2*imode-1), 2._rt*sx_jx[ix]*sz_jx[iz]*wqx*xy.real());
304  amrex::Gpu::Atomic::AddNoRet( &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 2*imode ), 2._rt*sx_jx[ix]*sz_jx[iz]*wqx*xy.imag());
305  amrex::Gpu::Atomic::AddNoRet( &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode-1), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.real());
306  amrex::Gpu::Atomic::AddNoRet( &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode ), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.imag());
307  amrex::Gpu::Atomic::AddNoRet( &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode-1), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.real());
308  amrex::Gpu::Atomic::AddNoRet( &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode ), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.imag());
309  xy = xy*xy0;
310  }
311 #endif
312  }
313  }
314 #elif defined(WARPX_DIM_3D)
315  for (int iz=0; iz<=depos_order; iz++){
316  for (int iy=0; iy<=depos_order; iy++){
317  for (int ix=0; ix<=depos_order; ix++){
319  &jx_arr(lo.x+j_jx+ix, lo.y+k_jx+iy, lo.z+l_jx+iz),
320  sx_jx[ix]*sy_jx[iy]*sz_jx[iz]*wqx);
322  &jy_arr(lo.x+j_jy+ix, lo.y+k_jy+iy, lo.z+l_jy+iz),
323  sx_jy[ix]*sy_jy[iy]*sz_jy[iz]*wqy);
325  &jz_arr(lo.x+j_jz+ix, lo.y+k_jz+iy, lo.z+l_jz+iz),
326  sx_jz[ix]*sy_jz[iy]*sz_jz[iz]*wqz);
327  }
328  }
329  }
330 #endif
331  }
332  );
333 #if defined(WARPX_USE_GPUCLOCK)
334  if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
336  *cost += *cost_real;
337  amrex::The_Managed_Arena()->free(cost_real);
338  }
339 #endif
340 }
341 
367 template <int depos_order>
369  const amrex::ParticleReal * const wp,
370  const amrex::ParticleReal * const uxp,
371  const amrex::ParticleReal * const uyp,
372  const amrex::ParticleReal * const uzp,
373  const int* ion_lev,
374  amrex::FArrayBox& jx_fab,
375  amrex::FArrayBox& jy_fab,
376  amrex::FArrayBox& jz_fab,
377  long np_to_depose,
378  const amrex::Real relative_time,
379  const std::array<amrex::Real,3>& dx,
380  const std::array<amrex::Real,3>& xyzmin,
381  amrex::Dim3 lo,
382  amrex::Real q,
383  int n_rz_azimuthal_modes,
384  amrex::Real* cost,
385  long load_balance_costs_update_algo,
387  const amrex::Box& box,
388  const amrex::Geometry& geom,
389  const amrex::IntVect& a_tbox_max_size)
390 {
391  using namespace amrex::literals;
392 
393 #if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA)
394  using namespace amrex;
395 
396  auto permutation = a_bins.permutationPtr();
397 
398  amrex::Array4<amrex::Real> const& jx_arr = jx_fab.array();
399  amrex::Array4<amrex::Real> const& jy_arr = jy_fab.array();
400  amrex::Array4<amrex::Real> const& jz_arr = jz_fab.array();
401  amrex::IntVect const jx_type = jx_fab.box().type();
402  amrex::IntVect const jy_type = jy_fab.box().type();
403  amrex::IntVect const jz_type = jz_fab.box().type();
404 
405  constexpr int zdir = WARPX_ZINDEX;
406  constexpr int NODE = amrex::IndexType::NODE;
407  constexpr int CELL = amrex::IndexType::CELL;
408 
409  // Loop over particles and deposit into jx_fab, jy_fab and jz_fab
410 #if defined(WARPX_USE_GPUCLOCK)
411  amrex::Real* cost_real = nullptr;
412  if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
413  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
414  *cost_real = 0._rt;
415  }
416 #endif
417  const auto dxiarr = geom.InvCellSizeArray();
418  const auto plo = geom.ProbLoArray();
419  const auto domain = geom.Domain();
420 
421  amrex::Box sample_tbox(IntVect(AMREX_D_DECL(0,0,0)), a_tbox_max_size - 1);
422  sample_tbox.grow(depos_order);
423 
424  amrex::Box sample_tbox_x = convert(sample_tbox, jx_type);
425  amrex::Box sample_tbox_y = convert(sample_tbox, jy_type);
426  amrex::Box sample_tbox_z = convert(sample_tbox, jz_type);
427 
428  const auto npts = amrex::max(sample_tbox_x.numPts(), sample_tbox_y.numPts(), sample_tbox_z.numPts());
429 
430  const int nblocks = a_bins.numBins();
431  const int threads_per_block = WarpX::shared_mem_current_tpb;
432  const auto offsets_ptr = a_bins.offsetsPtr();
433 
434  const std::size_t shared_mem_bytes = npts*sizeof(amrex::Real);
435  const amrex::IntVect bin_size = WarpX::shared_tilesize;
436  const std::size_t max_shared_mem_bytes = amrex::Gpu::Device::sharedMemPerBlock();
437  WARPX_ALWAYS_ASSERT_WITH_MESSAGE(shared_mem_bytes <= max_shared_mem_bytes,
438  "Tile size too big for GPU shared memory current deposition");
439 
440  amrex::ignore_unused(np_to_depose);
441  // Launch one thread-block per bin
443  nblocks, threads_per_block, shared_mem_bytes, amrex::Gpu::gpuStream(),
444  [=] AMREX_GPU_DEVICE () noexcept {
445 #if defined(WARPX_USE_GPUCLOCK)
446  const auto KernelTimer = ablastr::parallelization::KernelTimer(
447  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
448  cost_real);
449 #endif
450  const int bin_id = blockIdx.x;
451  const unsigned int bin_start = offsets_ptr[bin_id];
452  const unsigned int bin_stop = offsets_ptr[bin_id+1];
453 
454  if (bin_start == bin_stop) { return; /*this bin has no particles*/ }
455 
456  // These boxes define the index space for the shared memory buffers
457  amrex::Box buffer_box;
458  {
459  ParticleReal xp, yp, zp;
460  GetPosition(permutation[bin_start], xp, yp, zp);
461 #if defined(WARPX_DIM_3D)
462  IntVect iv = IntVect(int( amrex::Math::floor((xp-plo[0]) * dxiarr[0]) ),
463  int( amrex::Math::floor((yp-plo[1]) * dxiarr[1]) ),
464  int( amrex::Math::floor((zp-plo[2]) * dxiarr[2]) ));
465 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
466  IntVect iv = IntVect(int( amrex::Math::floor((xp-plo[0]) * dxiarr[0]) ),
467  int( amrex::Math::floor((zp-plo[1]) * dxiarr[1]) ));
468 #elif defined(WARPX_DIM_1D_Z)
469  IntVect iv = IntVect(int( amrex::Math::floor((zp-plo[0]) * dxiarr[0]) ));
470 #endif
471  iv += domain.smallEnd();
472  getTileIndex(iv, box, true, bin_size, buffer_box);
473  }
474 
475  buffer_box.grow(depos_order);
476  Box tbox_x = convert(buffer_box, jx_type);
477  Box tbox_y = convert(buffer_box, jy_type);
478  Box tbox_z = convert(buffer_box, jz_type);
479 
481  amrex::Real* const shared = gsm.dataPtr();
482 
483  amrex::Array4<amrex::Real> const jx_buff(shared,
484  amrex::begin(tbox_x), amrex::end(tbox_x), 1);
485  amrex::Array4<amrex::Real> const jy_buff(shared,
486  amrex::begin(tbox_y), amrex::end(tbox_y), 1);
487  amrex::Array4<amrex::Real> const jz_buff(shared,
488  amrex::begin(tbox_z), amrex::end(tbox_z), 1);
489 
490  // Zero-initialize the temporary array in shared memory
491  volatile amrex::Real* vs = shared;
492  for (int i = threadIdx.x; i < npts; i += blockDim.x){
493  vs[i] = 0.0;
494  }
495  __syncthreads();
496  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
497  {
498  const unsigned int ip = permutation[ip_orig];
499  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jx_buff, jx_type,
500  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
501  ip, zdir, NODE, CELL, 0);
502  }
503 
504  __syncthreads();
505  addLocalToGlobal(tbox_x, jx_arr, jx_buff);
506  for (int i = threadIdx.x; i < npts; i += blockDim.x){
507  vs[i] = 0.0;
508  }
509 
510  __syncthreads();
511  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
512  {
513  const unsigned int ip = permutation[ip_orig];
514  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jy_buff, jy_type,
515  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
516  ip, zdir, NODE, CELL, 1);
517  }
518 
519  __syncthreads();
520  addLocalToGlobal(tbox_y, jy_arr, jy_buff);
521  for (int i = threadIdx.x; i < npts; i += blockDim.x){
522  vs[i] = 0.0;
523  }
524 
525  __syncthreads();
526  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
527  {
528  const unsigned int ip = permutation[ip_orig];
529  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jz_buff, jz_type,
530  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
531  ip, zdir, NODE, CELL, 2);
532  }
533 
534  __syncthreads();
535  addLocalToGlobal(tbox_z, jz_arr, jz_buff);
536  });
537 #if defined(WARPX_USE_GPUCLOCK)
538  if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
540  *cost += *cost_real;
541  amrex::The_Managed_Arena()->free(cost_real);
542  }
543 #endif
544 #else // not using hip/cuda
545  // Note, you should never reach this part of the code. This funcion cannot be called unless
546  // using HIP/CUDA, and those things are checked prior
547  //don't use any args
548  ignore_unused( GetPosition, wp, uxp, uyp, uzp, ion_lev, jx_fab, jy_fab, jz_fab, np_to_depose, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes, cost, load_balance_costs_update_algo, a_bins, box, geom, a_tbox_max_size);
549  WARPX_ABORT_WITH_MESSAGE("Shared memory only implemented for HIP/CUDA");
550 #endif
551 }
552 
579 template <int depos_order>
581  const amrex::ParticleReal * const wp,
582  const amrex::ParticleReal * const uxp,
583  const amrex::ParticleReal * const uyp,
584  const amrex::ParticleReal * const uzp,
585  const int* ion_lev,
586  const amrex::Array4<amrex::Real>& Jx_arr,
587  const amrex::Array4<amrex::Real>& Jy_arr,
588  const amrex::Array4<amrex::Real>& Jz_arr,
589  long np_to_depose,
590  amrex::Real dt,
591  amrex::Real relative_time,
592  const std::array<amrex::Real,3>& dx,
593  std::array<amrex::Real, 3> xyzmin,
594  amrex::Dim3 lo,
595  amrex::Real q,
596  int n_rz_azimuthal_modes,
597  amrex::Real * const cost,
598  long load_balance_costs_update_algo)
599 {
600  using namespace amrex;
601  using namespace amrex::literals;
602 
603 #if !defined(WARPX_DIM_RZ)
604  ignore_unused(n_rz_azimuthal_modes);
605 #endif
606 
607 #if !defined(AMREX_USE_GPU)
608  amrex::ignore_unused(cost, load_balance_costs_update_algo);
609 #endif
610 
611  // Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
612  // (do_ionization=1)
613  bool const do_ionization = ion_lev;
614 #if !defined(WARPX_DIM_1D_Z)
615  Real const dxi = 1.0_rt / dx[0];
616 #endif
617 #if !defined(WARPX_DIM_1D_Z)
618  Real const xmin = xyzmin[0];
619 #endif
620 #if defined(WARPX_DIM_3D)
621  Real const dyi = 1.0_rt / dx[1];
622  Real const ymin = xyzmin[1];
623 #endif
624  Real const dzi = 1.0_rt / dx[2];
625  Real const zmin = xyzmin[2];
626 
627 #if defined(WARPX_DIM_3D)
628  Real const invdtdx = 1.0_rt / (dt*dx[1]*dx[2]);
629  Real const invdtdy = 1.0_rt / (dt*dx[0]*dx[2]);
630  Real const invdtdz = 1.0_rt / (dt*dx[0]*dx[1]);
631 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
632  Real const invdtdx = 1.0_rt / (dt*dx[2]);
633  Real const invdtdz = 1.0_rt / (dt*dx[0]);
634  Real const invvol = 1.0_rt / (dx[0]*dx[2]);
635 #elif defined(WARPX_DIM_1D_Z)
636  Real const invdtdz = 1.0_rt / (dt*dx[0]);
637  Real const invvol = 1.0_rt / (dx[2]);
638 #endif
639 
640 #if defined(WARPX_DIM_RZ)
641  Complex const I = Complex{0._rt, 1._rt};
642 #endif
643 
644  Real const clightsq = 1.0_rt / ( PhysConst::c * PhysConst::c );
645 #if !defined(WARPX_DIM_1D_Z)
646  Real constexpr one_third = 1.0_rt / 3.0_rt;
647  Real constexpr one_sixth = 1.0_rt / 6.0_rt;
648 #endif
649 
650  // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr
651 #if defined(WARPX_USE_GPUCLOCK)
652  amrex::Real* cost_real = nullptr;
653  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
654  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
655  *cost_real = 0._rt;
656  }
657 #endif
659  np_to_depose,
660  [=] AMREX_GPU_DEVICE (long const ip) {
661 #if defined(WARPX_USE_GPUCLOCK)
662  const auto KernelTimer = ablastr::parallelization::KernelTimer(
663  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
664  cost_real);
665 #endif
666 
667  // --- Get particle quantities
668  Real const gaminv = 1.0_rt/std::sqrt(1.0_rt + uxp[ip]*uxp[ip]*clightsq
669  + uyp[ip]*uyp[ip]*clightsq
670  + uzp[ip]*uzp[ip]*clightsq);
671 
672  // wqx, wqy wqz are particle current in each direction
673  Real wq = q*wp[ip];
674  if (do_ionization){
675  wq *= ion_lev[ip];
676  }
677 
678  ParticleReal xp, yp, zp;
679  GetPosition(ip, xp, yp, zp);
680 
681 #if !defined(WARPX_DIM_1D_Z)
682  Real const wqx = wq*invdtdx;
683 #endif
684 #if defined(WARPX_DIM_3D)
685  Real const wqy = wq*invdtdy;
686 #endif
687  Real const wqz = wq*invdtdz;
688 
689  // computes current and old position in grid units
690 #if defined(WARPX_DIM_RZ)
691  Real const xp_new = xp + (relative_time + 0.5_rt*dt)*uxp[ip]*gaminv;
692  Real const yp_new = yp + (relative_time + 0.5_rt*dt)*uyp[ip]*gaminv;
693  Real const xp_mid = xp_new - 0.5_rt*dt*uxp[ip]*gaminv;
694  Real const yp_mid = yp_new - 0.5_rt*dt*uyp[ip]*gaminv;
695  Real const xp_old = xp_new - dt*uxp[ip]*gaminv;
696  Real const yp_old = yp_new - dt*uyp[ip]*gaminv;
697  Real const rp_new = std::sqrt(xp_new*xp_new + yp_new*yp_new);
698  Real const rp_mid = std::sqrt(xp_mid*xp_mid + yp_mid*yp_mid);
699  Real const rp_old = std::sqrt(xp_old*xp_old + yp_old*yp_old);
700  Real costheta_new, sintheta_new;
701  if (rp_new > 0._rt) {
702  costheta_new = xp_new/rp_new;
703  sintheta_new = yp_new/rp_new;
704  } else {
705  costheta_new = 1._rt;
706  sintheta_new = 0._rt;
707  }
708  amrex::Real costheta_mid, sintheta_mid;
709  if (rp_mid > 0._rt) {
710  costheta_mid = xp_mid/rp_mid;
711  sintheta_mid = yp_mid/rp_mid;
712  } else {
713  costheta_mid = 1._rt;
714  sintheta_mid = 0._rt;
715  }
716  amrex::Real costheta_old, sintheta_old;
717  if (rp_old > 0._rt) {
718  costheta_old = xp_old/rp_old;
719  sintheta_old = yp_old/rp_old;
720  } else {
721  costheta_old = 1._rt;
722  sintheta_old = 0._rt;
723  }
724  const Complex xy_new0 = Complex{costheta_new, sintheta_new};
725  const Complex xy_mid0 = Complex{costheta_mid, sintheta_mid};
726  const Complex xy_old0 = Complex{costheta_old, sintheta_old};
727  // Keep these double to avoid bug in single precision
728  double const x_new = (rp_new - xmin)*dxi;
729  double const x_old = (rp_old - xmin)*dxi;
730 #else
731 #if !defined(WARPX_DIM_1D_Z)
732  // Keep these double to avoid bug in single precision
733  double const x_new = (xp - xmin + (relative_time + 0.5_rt*dt)*uxp[ip]*gaminv)*dxi;
734  double const x_old = x_new - dt*dxi*uxp[ip]*gaminv;
735 #endif
736 #endif
737 #if defined(WARPX_DIM_3D)
738  // Keep these double to avoid bug in single precision
739  double const y_new = (yp - ymin + (relative_time + 0.5_rt*dt)*uyp[ip]*gaminv)*dyi;
740  double const y_old = y_new - dt*dyi*uyp[ip]*gaminv;
741 #endif
742  // Keep these double to avoid bug in single precision
743  double const z_new = (zp - zmin + (relative_time + 0.5_rt*dt)*uzp[ip]*gaminv)*dzi;
744  double const z_old = z_new - dt*dzi*uzp[ip]*gaminv;
745 
746 #if defined(WARPX_DIM_RZ)
747  Real const vy = (-uxp[ip]*sintheta_mid + uyp[ip]*costheta_mid)*gaminv;
748 #elif defined(WARPX_DIM_XZ)
749  Real const vy = uyp[ip]*gaminv;
750 #elif defined(WARPX_DIM_1D_Z)
751  Real const vx = uxp[ip]*gaminv;
752  Real const vy = uyp[ip]*gaminv;
753 #endif
754 
755  // Shape factor arrays
756  // Note that there are extra values above and below
757  // to possibly hold the factor for the old particle
758  // which can be at a different grid location.
759  // Keep these double to avoid bug in single precision
760 #if !defined(WARPX_DIM_1D_Z)
761  double sx_new[depos_order + 3] = {0.};
762  double sx_old[depos_order + 3] = {0.};
763 #endif
764 #if defined(WARPX_DIM_3D)
765  // Keep these double to avoid bug in single precision
766  double sy_new[depos_order + 3] = {0.};
767  double sy_old[depos_order + 3] = {0.};
768 #endif
769  // Keep these double to avoid bug in single precision
770  double sz_new[depos_order + 3] = {0.};
771  double sz_old[depos_order + 3] = {0.};
772 
773  // --- Compute shape factors
774  // Compute shape factors for position as they are now and at old positions
775  // [ijk]_new: leftmost grid point that the particle touches
776  const Compute_shape_factor< depos_order > compute_shape_factor;
777  const Compute_shifted_shape_factor< depos_order > compute_shifted_shape_factor;
778 
779 #if !defined(WARPX_DIM_1D_Z)
780  const int i_new = compute_shape_factor(sx_new+1, x_new);
781  const int i_old = compute_shifted_shape_factor(sx_old, x_old, i_new);
782 #endif
783 #if defined(WARPX_DIM_3D)
784  const int j_new = compute_shape_factor(sy_new+1, y_new);
785  const int j_old = compute_shifted_shape_factor(sy_old, y_old, j_new);
786 #endif
787  const int k_new = compute_shape_factor(sz_new+1, z_new);
788  const int k_old = compute_shifted_shape_factor(sz_old, z_old, k_new);
789 
790  // computes min/max positions of current contributions
791 #if !defined(WARPX_DIM_1D_Z)
792  int dil = 1, diu = 1;
793  if (i_old < i_new) dil = 0;
794  if (i_old > i_new) diu = 0;
795 #endif
796 #if defined(WARPX_DIM_3D)
797  int djl = 1, dju = 1;
798  if (j_old < j_new) djl = 0;
799  if (j_old > j_new) dju = 0;
800 #endif
801  int dkl = 1, dku = 1;
802  if (k_old < k_new) dkl = 0;
803  if (k_old > k_new) dku = 0;
804 
805 #if defined(WARPX_DIM_3D)
806 
807  for (int k=dkl; k<=depos_order+2-dku; k++) {
808  for (int j=djl; j<=depos_order+2-dju; j++) {
809  amrex::Real sdxi = 0._rt;
810  for (int i=dil; i<=depos_order+1-diu; i++) {
811  sdxi += wqx*(sx_old[i] - sx_new[i])*(
812  one_third*(sy_new[j]*sz_new[k] + sy_old[j]*sz_old[k])
813  +one_sixth*(sy_new[j]*sz_old[k] + sy_old[j]*sz_new[k]));
814  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdxi);
815  }
816  }
817  }
818  for (int k=dkl; k<=depos_order+2-dku; k++) {
819  for (int i=dil; i<=depos_order+2-diu; i++) {
820  amrex::Real sdyj = 0._rt;
821  for (int j=djl; j<=depos_order+1-dju; j++) {
822  sdyj += wqy*(sy_old[j] - sy_new[j])*(
823  one_third*(sx_new[i]*sz_new[k] + sx_old[i]*sz_old[k])
824  +one_sixth*(sx_new[i]*sz_old[k] + sx_old[i]*sz_new[k]));
825  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdyj);
826  }
827  }
828  }
829  for (int j=djl; j<=depos_order+2-dju; j++) {
830  for (int i=dil; i<=depos_order+2-diu; i++) {
831  amrex::Real sdzk = 0._rt;
832  for (int k=dkl; k<=depos_order+1-dku; k++) {
833  sdzk += wqz*(sz_old[k] - sz_new[k])*(
834  one_third*(sx_new[i]*sy_new[j] + sx_old[i]*sy_old[j])
835  +one_sixth*(sx_new[i]*sy_old[j] + sx_old[i]*sy_new[j]));
836  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdzk);
837  }
838  }
839  }
840 
841 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
842 
843  for (int k=dkl; k<=depos_order+2-dku; k++) {
844  amrex::Real sdxi = 0._rt;
845  for (int i=dil; i<=depos_order+1-diu; i++) {
846  sdxi += wqx*(sx_old[i] - sx_new[i])*0.5_rt*(sz_new[k] + sz_old[k]);
847  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdxi);
848 #if defined(WARPX_DIM_RZ)
849  Complex xy_mid = xy_mid0; // Throughout the following loop, xy_mid takes the value e^{i m theta}
850  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
851  // The factor 2 comes from the normalization of the modes
852  const Complex djr_cmplx = 2._rt *sdxi*xy_mid;
853  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djr_cmplx.real());
854  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djr_cmplx.imag());
855  xy_mid = xy_mid*xy_mid0;
856  }
857 #endif
858  }
859  }
860  for (int k=dkl; k<=depos_order+2-dku; k++) {
861  for (int i=dil; i<=depos_order+2-diu; i++) {
862  Real const sdyj = wq*vy*invvol*(
863  one_third*(sx_new[i]*sz_new[k] + sx_old[i]*sz_old[k])
864  +one_sixth*(sx_new[i]*sz_old[k] + sx_old[i]*sz_new[k]));
865  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdyj);
866 #if defined(WARPX_DIM_RZ)
867  Complex xy_new = xy_new0;
868  Complex xy_mid = xy_mid0;
869  Complex xy_old = xy_old0;
870  // Throughout the following loop, xy_ takes the value e^{i m theta_}
871  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
872  // The factor 2 comes from the normalization of the modes
873  // The minus sign comes from the different convention with respect to Davidson et al.
874  const Complex djt_cmplx = -2._rt * I*(i_new-1 + i + xmin*dxi)*wq*invdtdx/(amrex::Real)imode
875  *(Complex(sx_new[i]*sz_new[k], 0._rt)*(xy_new - xy_mid)
876  + Complex(sx_old[i]*sz_old[k], 0._rt)*(xy_mid - xy_old));
877  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djt_cmplx.real());
878  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djt_cmplx.imag());
879  xy_new = xy_new*xy_new0;
880  xy_mid = xy_mid*xy_mid0;
881  xy_old = xy_old*xy_old0;
882  }
883 #endif
884  }
885  }
886  for (int i=dil; i<=depos_order+2-diu; i++) {
887  Real sdzk = 0._rt;
888  for (int k=dkl; k<=depos_order+1-dku; k++) {
889  sdzk += wqz*(sz_old[k] - sz_new[k])*0.5_rt*(sx_new[i] + sx_old[i]);
890  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdzk);
891 #if defined(WARPX_DIM_RZ)
892  Complex xy_mid = xy_mid0; // Throughout the following loop, xy_mid takes the value e^{i m theta}
893  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
894  // The factor 2 comes from the normalization of the modes
895  const Complex djz_cmplx = 2._rt * sdzk * xy_mid;
896  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djz_cmplx.real());
897  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djz_cmplx.imag());
898  xy_mid = xy_mid*xy_mid0;
899  }
900 #endif
901  }
902  }
903 #elif defined(WARPX_DIM_1D_Z)
904 
905  for (int k=dkl; k<=depos_order+2-dku; k++) {
906  amrex::Real const sdxi = wq*vx*invvol*0.5_rt*(sz_old[k] + sz_new[k]);
907  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+k_new-1+k, 0, 0, 0), sdxi);
908  }
909  for (int k=dkl; k<=depos_order+2-dku; k++) {
910  amrex::Real const sdyj = wq*vy*invvol*0.5_rt*(sz_old[k] + sz_new[k]);
911  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+k_new-1+k, 0, 0, 0), sdyj);
912  }
913  amrex::Real sdzk = 0._rt;
914  for (int k=dkl; k<=depos_order+1-dku; k++) {
915  sdzk += wqz*(sz_old[k] - sz_new[k]);
916  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+k_new-1+k, 0, 0, 0), sdzk);
917  }
918 #endif
919  }
920  );
921 #if defined(WARPX_USE_GPUCLOCK)
922  if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
924  *cost += *cost_real;
925  amrex::The_Managed_Arena()->free(cost_real);
926  }
927 #endif
928 }
929 
959 template <int depos_order>
961  const amrex::ParticleReal* const wp,
962  const amrex::ParticleReal* const uxp,
963  const amrex::ParticleReal* const uyp,
964  const amrex::ParticleReal* const uzp,
965  const int* const ion_lev,
966  amrex::FArrayBox& Dx_fab,
967  amrex::FArrayBox& Dy_fab,
968  amrex::FArrayBox& Dz_fab,
969  long np_to_depose,
970  amrex::Real dt,
971  amrex::Real relative_time,
972  const std::array<amrex::Real,3>& dx,
973  const std::array<amrex::Real,3>& xyzmin,
974  amrex::Dim3 lo,
975  amrex::Real q,
976  int n_rz_azimuthal_modes,
977  amrex::Real* cost,
978  long load_balance_costs_update_algo)
979 {
980  using namespace amrex::literals;
981 
982 #if defined(WARPX_DIM_RZ)
983  amrex::ignore_unused(GetPosition,
984  wp, uxp, uyp, uzp, ion_lev, Dx_fab, Dy_fab, Dz_fab,
985  np_to_depose, dt, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes);
986  WARPX_ABORT_WITH_MESSAGE("Vay deposition not implemented in RZ geometry");
987 #endif
988 
989 #if defined(WARPX_DIM_1D_Z)
990  amrex::ignore_unused(GetPosition,
991  wp, uxp, uyp, uzp, ion_lev, Dx_fab, Dy_fab, Dz_fab,
992  np_to_depose, dt, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes);
993  WARPX_ABORT_WITH_MESSAGE("Vay deposition not implemented in cartesian 1D geometry");
994 #endif
995 
996 #if !defined(WARPX_USE_GPUCLOCK)
997  amrex::ignore_unused(cost, load_balance_costs_update_algo);
998 #endif
999 
1000 #if !(defined WARPX_DIM_RZ || defined WARPX_DIM_1D_Z)
1001  amrex::ignore_unused(n_rz_azimuthal_modes);
1002 
1003  // If ion_lev is a null pointer, then do_ionization=0, else do_ionization=1
1004  const bool do_ionization = ion_lev;
1005 
1006  // Inverse cell volume in each direction
1007  const amrex::Real dxi = 1._rt / dx[0];
1008  const amrex::Real dzi = 1._rt / dx[2];
1009 #if defined(WARPX_DIM_3D)
1010  const amrex::Real dyi = 1._rt / dx[1];
1011 #endif
1012 
1013  // Inverse of time step
1014  const amrex::Real invdt = 1._rt / dt;
1015 
1016  // Total inverse cell volume
1017 #if defined(WARPX_DIM_XZ)
1018  const amrex::Real invvol = dxi * dzi;
1019 #elif defined(WARPX_DIM_3D)
1020  const amrex::Real invvol = dxi * dyi * dzi;
1021 #endif
1022 
1023  // Lower bound of physical domain in each direction
1024  const amrex::Real xmin = xyzmin[0];
1025  const amrex::Real zmin = xyzmin[2];
1026 #if defined(WARPX_DIM_3D)
1027  const amrex::Real ymin = xyzmin[1];
1028 #endif
1029 
1030  // Allocate temporary arrays
1031 #if defined(WARPX_DIM_3D)
1032  AMREX_ALWAYS_ASSERT(Dx_fab.box() == Dy_fab.box() && Dx_fab.box() == Dz_fab.box());
1033  amrex::FArrayBox temp_fab{Dx_fab.box(), 4};
1034 #elif defined(WARPX_DIM_XZ)
1035  AMREX_ALWAYS_ASSERT(Dx_fab.box() == Dz_fab.box());
1036  amrex::FArrayBox temp_fab{Dx_fab.box(), 2};
1037 #endif
1038  temp_fab.setVal<amrex::RunOn::Device>(0._rt);
1039  amrex::Array4<amrex::Real> const& temp_arr = temp_fab.array();
1040 
1041  // Inverse of light speed squared
1042  const amrex::Real invcsq = 1._rt / (PhysConst::c * PhysConst::c);
1043 
1044  // Arrays where D will be stored
1045  amrex::Array4<amrex::Real> const& Dx_arr = Dx_fab.array();
1046  amrex::Array4<amrex::Real> const& Dy_arr = Dy_fab.array();
1047  amrex::Array4<amrex::Real> const& Dz_arr = Dz_fab.array();
1048 
1049  // Loop over particles and deposit (Dx,Dy,Dz) into Dx_fab, Dy_fab and Dz_fab
1050 #if defined(WARPX_USE_GPUCLOCK)
1051  amrex::Real* cost_real = nullptr;
1052  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
1053  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
1054  *cost_real = 0._rt;
1055  }
1056 #endif
1057  amrex::ParallelFor(np_to_depose, [=] AMREX_GPU_DEVICE (long ip)
1058  {
1059 #if defined(WARPX_USE_GPUCLOCK)
1060  const auto KernelTimer = ablastr::parallelization::KernelTimer(
1061  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
1062  cost_real);
1063 #endif
1064 
1065  // Inverse of Lorentz factor gamma
1066  const amrex::Real invgam = 1._rt / std::sqrt(1._rt + uxp[ip] * uxp[ip] * invcsq
1067  + uyp[ip] * uyp[ip] * invcsq
1068  + uzp[ip] * uzp[ip] * invcsq);
1069  // Product of particle charges and weights
1070  amrex::Real wq = q * wp[ip];
1071  if (do_ionization) wq *= ion_lev[ip];
1072 
1073  // Current particle positions (in physical units)
1074  amrex::ParticleReal xp, yp, zp;
1075  GetPosition(ip, xp, yp, zp);
1076 
1077  // Particle velocities
1078  const amrex::Real vx = uxp[ip] * invgam;
1079  const amrex::Real vy = uyp[ip] * invgam;
1080  const amrex::Real vz = uzp[ip] * invgam;
1081 
1082  // Modify the particle position to match the time of the deposition
1083  xp += relative_time * vx;
1084  yp += relative_time * vy;
1085  zp += relative_time * vz;
1086 
1087  // Particle current densities
1088 #if defined(WARPX_DIM_XZ)
1089  const amrex::Real wqy = wq * vy * invvol;
1090 #endif
1091 
1092  // Current and old particle positions in grid units
1093  // Keep these double to avoid bug in single precision.
1094  double const x_new = (xp - xmin + 0.5_rt*dt*vx) * dxi;
1095  double const x_old = (xp - xmin - 0.5_rt*dt*vx) * dxi;
1096 #if defined(WARPX_DIM_3D)
1097  // Keep these double to avoid bug in single precision.
1098  double const y_new = (yp - ymin + 0.5_rt*dt*vy) * dyi;
1099  double const y_old = (yp - ymin - 0.5_rt*dt*vy) * dyi;
1100 #endif
1101  // Keep these double to avoid bug in single precision.
1102  double const z_new = (zp - zmin + 0.5_rt*dt*vz) * dzi;
1103  double const z_old = (zp - zmin - 0.5_rt*dt*vz) * dzi;
1104 
1105  // Shape factor arrays for current and old positions (nodal)
1106  // Keep these double to avoid bug in single precision.
1107  double sx_new[depos_order+1] = {0.};
1108  double sx_old[depos_order+1] = {0.};
1109 #if defined(WARPX_DIM_3D)
1110  // Keep these double to avoid bug in single precision.
1111  double sy_new[depos_order+1] = {0.};
1112  double sy_old[depos_order+1] = {0.};
1113 #endif
1114  // Keep these double to avoid bug in single precision.
1115  double sz_new[depos_order+1] = {0.};
1116  double sz_old[depos_order+1] = {0.};
1117 
1118  // Compute shape factors for current positions
1119 
1120  // i_new leftmost grid point in x that the particle touches
1121  // sx_new shape factor along x for the centering of each current
1122  Compute_shape_factor< depos_order > const compute_shape_factor;
1123  const int i_new = compute_shape_factor(sx_new, x_new);
1124 #if defined(WARPX_DIM_3D)
1125  // j_new leftmost grid point in y that the particle touches
1126  // sy_new shape factor along y for the centering of each current
1127  const int j_new = compute_shape_factor(sy_new, y_new);
1128 #endif
1129  // k_new leftmost grid point in z that the particle touches
1130  // sz_new shape factor along z for the centering of each current
1131  const int k_new = compute_shape_factor(sz_new, z_new);
1132 
1133  // Compute shape factors for old positions
1134 
1135  // i_old leftmost grid point in x that the particle touches
1136  // sx_old shape factor along x for the centering of each current
1137  const int i_old = compute_shape_factor(sx_old, x_old);
1138 #if defined(WARPX_DIM_3D)
1139  // j_old leftmost grid point in y that the particle touches
1140  // sy_old shape factor along y for the centering of each current
1141  const int j_old = compute_shape_factor(sy_old, y_old);
1142 #endif
1143  // k_old leftmost grid point in z that the particle touches
1144  // sz_old shape factor along z for the centering of each current
1145  const int k_old = compute_shape_factor(sz_old, z_old);
1146 
1147  // Deposit current into Dx_arr, Dy_arr and Dz_arr
1148 #if defined(WARPX_DIM_XZ)
1149 
1150  for (int k=0; k<=depos_order; k++) {
1151  for (int i=0; i<=depos_order; i++) {
1152 
1153  // Re-casting sx_new and sz_new from double to amrex::Real so that
1154  // Atomic::Add has consistent types in its argument
1155  auto const sxn_szn = static_cast<amrex::Real>(sx_new[i] * sz_new[k]);
1156  auto const sxo_szn = static_cast<amrex::Real>(sx_old[i] * sz_new[k]);
1157  auto const sxn_szo = static_cast<amrex::Real>(sx_new[i] * sz_old[k]);
1158  auto const sxo_szo = static_cast<amrex::Real>(sx_old[i] * sz_old[k]);
1159 
1160  if (i_new == i_old && k_new == k_old) {
1161  // temp arrays for Dx and Dz
1162  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1163  wq * invvol * invdt * (sxn_szn - sxo_szo));
1164 
1165  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 1),
1166  wq * invvol * invdt * (sxn_szo - sxo_szn));
1167 
1168  // Dy
1169  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1170  wqy * 0.25_rt * (sxn_szn + sxn_szo + sxo_szn + sxo_szo));
1171  } else {
1172  // temp arrays for Dx and Dz
1173  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1174  wq * invvol * invdt * sxn_szn);
1175 
1176  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + k_old + k, 0, 0),
1177  - wq * invvol * invdt * sxo_szo);
1178 
1179  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_old + k, 0, 1),
1180  wq * invvol * invdt * sxn_szo);
1181 
1182  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + k_new + k, 0, 1),
1183  - wq * invvol * invdt * sxo_szn);
1184 
1185  // Dy
1186  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1187  wqy * 0.25_rt * sxn_szn);
1188 
1189  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_old + k, 0, 0),
1190  wqy * 0.25_rt * sxn_szo);
1191 
1192  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_old + i, lo.y + k_new + k, 0, 0),
1193  wqy * 0.25_rt * sxo_szn);
1194 
1195  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_old + i, lo.y + k_old + k, 0, 0),
1196  wqy * 0.25_rt * sxo_szo);
1197  }
1198 
1199  }
1200  }
1201 
1202 #elif defined(WARPX_DIM_3D)
1203 
1204  for (int k=0; k<=depos_order; k++) {
1205  for (int j=0; j<=depos_order; j++) {
1206 
1207  auto const syn_szn = static_cast<amrex::Real>(sy_new[j] * sz_new[k]);
1208  auto const syo_szn = static_cast<amrex::Real>(sy_old[j] * sz_new[k]);
1209  auto const syn_szo = static_cast<amrex::Real>(sy_new[j] * sz_old[k]);
1210  auto const syo_szo = static_cast<amrex::Real>(sy_old[j] * sz_old[k]);
1211 
1212  for (int i=0; i<=depos_order; i++) {
1213 
1214  auto const sxn_syn_szn = static_cast<amrex::Real>(sx_new[i]) * syn_szn;
1215  auto const sxo_syn_szn = static_cast<amrex::Real>(sx_old[i]) * syn_szn;
1216  auto const sxn_syo_szn = static_cast<amrex::Real>(sx_new[i]) * syo_szn;
1217  auto const sxo_syo_szn = static_cast<amrex::Real>(sx_old[i]) * syo_szn;
1218  auto const sxn_syn_szo = static_cast<amrex::Real>(sx_new[i]) * syn_szo;
1219  auto const sxo_syn_szo = static_cast<amrex::Real>(sx_old[i]) * syn_szo;
1220  auto const sxn_syo_szo = static_cast<amrex::Real>(sx_new[i]) * syo_szo;
1221  auto const sxo_syo_szo = static_cast<amrex::Real>(sx_old[i]) * syo_szo;
1222 
1223  if (i_new == i_old && j_new == j_old && k_new == k_old) {
1224  // temp arrays for Dx, Dy and Dz
1225  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 0),
1226  wq * invvol * invdt * (sxn_syn_szn - sxo_syo_szo));
1227 
1228  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 1),
1229  wq * invvol * invdt * (sxn_syn_szo - sxo_syo_szn));
1230 
1231  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 2),
1232  wq * invvol * invdt * (sxn_syo_szn - sxo_syn_szo));
1233 
1234  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 3),
1235  wq * invvol * invdt * (sxo_syn_szn - sxn_syo_szo));
1236  } else {
1237  // temp arrays for Dx, Dy and Dz
1238  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 0),
1239  wq * invvol * invdt * sxn_syn_szn);
1240 
1241  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_old + j, lo.z + k_old + k, 0),
1242  - wq * invvol * invdt * sxo_syo_szo);
1243 
1244  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_old + k, 1),
1245  wq * invvol * invdt * sxn_syn_szo);
1246 
1247  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_old + j, lo.z + k_new + k, 1),
1248  - wq * invvol * invdt * sxo_syo_szn);
1249 
1250  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_old + j, lo.z + k_new + k, 2),
1251  wq * invvol * invdt * sxn_syo_szn);
1252 
1253  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_new + j, lo.z + k_old + k, 2),
1254  - wq * invvol * invdt * sxo_syn_szo);
1255 
1256  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_new + j, lo.z + k_new + k, 3),
1257  wq * invvol * invdt * sxo_syn_szn);
1258 
1259  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_old + j, lo.z + k_old + k, 3),
1260  - wq * invvol * invdt * sxn_syo_szo);
1261  }
1262  }
1263  }
1264  }
1265 #endif
1266  } );
1267 
1268 #if defined(WARPX_DIM_3D)
1269  amrex::ParallelFor(Dx_fab.box(), [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
1270  {
1271  const amrex::Real t_a = temp_arr(i,j,k,0);
1272  const amrex::Real t_b = temp_arr(i,j,k,1);
1273  const amrex::Real t_c = temp_arr(i,j,k,2);
1274  const amrex::Real t_d = temp_arr(i,j,k,3);
1275  Dx_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a + t_b + t_c - 2._rt*t_d);
1276  Dy_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a + t_b - 2._rt*t_c + t_d);
1277  Dz_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a - 2._rt*t_b + t_c + t_d);
1278  });
1279 #elif defined(WARPX_DIM_XZ)
1280  amrex::ParallelFor(Dx_fab.box(), [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept
1281  {
1282  const amrex::Real t_a = temp_arr(i,j,0,0);
1283  const amrex::Real t_b = temp_arr(i,j,0,1);
1284  Dx_arr(i,j,0) += (0.5_rt)*(t_a + t_b);
1285  Dz_arr(i,j,0) += (0.5_rt)*(t_a - t_b);
1286  });
1287 #endif
1288  // Synchronize so that temp_fab can be safely deallocated in its destructor
1290 
1291 
1292 # if defined(WARPX_USE_GPUCLOCK)
1293  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
1295  *cost += *cost_real;
1296  amrex::The_Managed_Arena()->free(cost_real);
1297  }
1298 # endif
1299 #endif // #if !(defined WARPX_DIM_RZ || defined WARPX_DIM_1D_Z)
1300 }
1301 #endif // CURRENTDEPOSITION_H_
#define AMREX_ALWAYS_ASSERT(EX)
#define AMREX_GPU_DEVICE
#define AMREX_D_DECL(a, b, c)
void doDepositionShapeN(const GetParticlePosition< PIdx > &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *ion_lev, amrex::FArrayBox &jx_fab, amrex::FArrayBox &jy_fab, amrex::FArrayBox &jz_fab, long np_to_depose, amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, amrex::Dim3 lo, amrex::Real q, int n_rz_azimuthal_modes, amrex::Real *cost, long load_balance_costs_update_algo)
Current Deposition for thread thread_num.
Definition: CurrentDeposition.H:54
void doEsirkepovDepositionShapeN(const GetParticlePosition< PIdx > &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *ion_lev, const amrex::Array4< amrex::Real > &Jx_arr, const amrex::Array4< amrex::Real > &Jy_arr, const amrex::Array4< amrex::Real > &Jz_arr, long np_to_depose, amrex::Real dt, amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, std::array< amrex::Real, 3 > xyzmin, amrex::Dim3 lo, amrex::Real q, int n_rz_azimuthal_modes, amrex::Real *const cost, long load_balance_costs_update_algo)
Esirkepov Current Deposition for thread thread_num.
Definition: CurrentDeposition.H:580
void doVayDepositionShapeN(const GetParticlePosition< PIdx > &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, amrex::FArrayBox &Dx_fab, amrex::FArrayBox &Dy_fab, amrex::FArrayBox &Dz_fab, long np_to_depose, amrex::Real dt, amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, amrex::Dim3 lo, amrex::Real q, int n_rz_azimuthal_modes, amrex::Real *cost, long load_balance_costs_update_algo)
Vay current deposition (Vay et al, 2013) for thread thread_num: deposit D in real space and store the...
Definition: CurrentDeposition.H:960
void doDepositionSharedShapeN(const GetParticlePosition< PIdx > &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *ion_lev, amrex::FArrayBox &jx_fab, amrex::FArrayBox &jy_fab, amrex::FArrayBox &jz_fab, long np_to_depose, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, amrex::Dim3 lo, amrex::Real q, int n_rz_azimuthal_modes, amrex::Real *cost, long load_balance_costs_update_algo, const amrex::DenseBins< WarpXParticleContainer::ParticleType > &a_bins, const amrex::Box &box, const amrex::Geometry &geom, const amrex::IntVect &a_tbox_max_size)
Current Deposition for thread thread_num using shared memory.
Definition: CurrentDeposition.H:368
#define WARPX_ABORT_WITH_MESSAGE(MSG)
Definition: TextMsg.H:15
#define WARPX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition: TextMsg.H:13
amrex::GpuComplex< amrex::Real > Complex
Definition: WarpX_Complex.H:22
NODE
static amrex::IntVect shared_tilesize
tileSize to use for shared current deposition operations
Definition: WarpX.H:243
static int shared_mem_current_tpb
number of threads to use per block in shared deposition
Definition: WarpX.H:240
Defines a timer object to be used on GPU; measures summed thread cycles.
Definition: KernelTimer.H:27
virtual void free(void *pt)=0
virtual void * alloc(std::size_t sz)=0
AMREX_FORCE_INLINE Array4< Real const > array() const noexcept
const Box & box() const noexcept
AMREX_GPU_HOST_DEVICE Long numPts() const noexcept
AMREX_GPU_HOST_DEVICE Box & grow(int i) noexcept
GpuArray< Real, AMREX_SPACEDIM > InvCellSizeArray() const noexcept
index_type * offsetsPtr() noexcept
index_type * permutationPtr() noexcept
Long numBins() const noexcept
GpuArray< Real, AMREX_SPACEDIM > ProbLoArray() const noexcept
const Box & Domain() const noexcept
static std::size_t sharedMemPerBlock() noexcept
static constexpr auto c
vacuum speed of light [m/s]
Definition: constant.H:44
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
void streamSynchronize() noexcept
gpuStream_t gpuStream() noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 end(Box const &box) noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box convert(const Box &b, const IntVect &typ) noexcept
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE const T & max(const T &a, const T &b) noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 begin(Box const &box) noexcept
void launch(T const &n, L &&f) noexcept
Arena * The_Managed_Arena()
std::enable_if_t< std::is_integral< T >::value > ParallelFor(TypeList< CTOs... >, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int getTileIndex(const IntVect &iv, const Box &box, const bool a_do_tiling, const IntVect &a_tile_size, Box &tbx)
i
Definition: check_interp_points_and_weights.py:174
float dt
Definition: stencil.py:442
tuple dx
lab frame
Definition: stencil.py:429
xmin
Definition: stencil.py:424
Definition: ShapeFactors.H:29
Definition: ShapeFactors.H:84
@ GpuClock
Definition: WarpXAlgorithmSelection.H:125
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE T real() const noexcept
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE T imag() const noexcept
AMREX_GPU_DEVICE T * dataPtr() noexcept