import React, { useState } from "react";
import Modal from "react-modal";

// Register the application root with react-modal. Per the react-modal
// accessibility docs, the library hides this element from assistive
// technology while a modal is open.
// NOTE(review): assumes the app is mounted into an element with id "root" —
// confirm against the host HTML page.
Modal.setAppElement("#root");

const Blog = () => {
  const [isOpen3, setIsOpen3] = useState(false);

  function toggleModalThree() {
    setIsOpen3(!isOpen3);
  }

  return (
    <>
      <div className="row">
        <div
          className="col-lg-4 col-md-6 my-3"
          data-aos="fade-right"
          data-aos-duration="1200"
          data-aos-delay="200"
        >
          <div className="blog-post" onClick={toggleModalThree}>
            <div className="blog-img">
              <div className="data">
                <span>1</span>
                <small>Jan</small>
              </div>

              <img src="img/blog/nvdia.jpg" alt="Cuda Programming"></img>
            </div>
            {/* End blog-img */}

            <div className="blog-info">
              <h6>CUDA C++ Programming</h6>
              <p>
                My takeaways from{" "}
                <a href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/">
                  Nvdia CUDA C++ Programming Guide
                </a>{" "}
                and{" "}
                <a href="https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/">
                  Nvdia CUDA C++ Best Practices Guide
                </a>
                ..
              </p>
              <div className="btn-bar">Read More</div>
            </div>
            {/* End blog-info */}
          </div>
          {/* End blog-post */}
        </div>
        {/* End .col for blog-1 */}
      </div>
      {/* End .row */}

      {/* Start Modal for Blog-3 */}
      <Modal
        isOpen={isOpen3}
        onRequestClose={toggleModalThree}
        contentLabel="My dialog"
        className="custom-modal"
        overlayClassName="custom-overlay"
        closeTimeoutMS={500}
      >
        <div>
          <button className="close-modal" onClick={toggleModalThree}>
            <img src="/img/cancel.svg" alt="close icon" />
          </button>
          {/* End close icon */}
          <div className="box_inner">
            <div className="scrollable">
              <div className="blog-grid">
                {/* End blog-img */}
                <article className="article">
                  <div className="article-title">
                    <h1>CUDA C++ Programming</h1>
                  </div>
                  {/* End .article-title */}

                  <div className="article-content">
                    <p>
                      The CPU is designed to excel at executing a sequence of
                      operations, called a thread, as fast as possible and can
                      execute a few of them in parallel while the GPU is
                      designed to excel at executing thousands of them in
                      parallel. Therefore, more transistors are devoted to data
                      processing rather than data caching and flow control in a
                      GPU, allowing it to hide memory access latencies with
                      computation, instead of relying on large caches and
                      complex data flows, which are expensive. CUDA programming
                      involves running code on two different platforms
                      concurrently: a host system with one or more CPUs and one
                      or more CUDA-enabled NVIDIA GPU devices.
                      <img
                        src="img/blog/transisters.jpg"
                        alt="Example CPU and GPU architecture"
                      ></img>
                    </p>

                    <h2>Programming Model</h2>
                    <h3>Thread Hierarchy</h3>
                    <p>
                      <pre>{`// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N],
float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    ...
    // Kernel invocation
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}`}</pre>
                    </p>
                    <h4>Thread Blocks</h4>
                    <p>
                      All threads of a block are on the same streaming
                      multiprocessor. On current GPUs, a thread block may
                      contain a multiple of 32 threads, up to 1024. A kernel
                      (block of code) can be executed by multiple blocks of the
                      same shape. Block grids can be 1D, 2D or 3D.
                      <img
                        src="img/blog/grid-of-thread-blocks.jpg"
                        alt="Grid of Thread Blocks"
                      ></img>
                      A common choice for block size is 16x16 (256 threads).
                      Thread blocks are executed independently. They can
                      cooperate through shared memory and synchronizing memory
                      accesses. The intrinsic function __syncthreads() acts as a
                      barrier, and it is expected to be lightweight.
                    </p>
                    <h4>Thread Block Clusters</h4>
                    <p>
                      Thread blocks in a cluster are guaranteed to be
                      co-scheduled on a GPU Processing Cluster (GPC) in the GPU.
                      <img
                        src="img/blog/grid-of-clusters.jpg"
                        alt="Grid of Thread Block Clusters"
                      ></img>
                      Block cluster grids can be 1D, 2D or 3D. The maximum
                      number of blocks per cluster is architecture-specific and
                      can be queried using the
                      cudaOccupancyMaxPotentialClusterSize API.
                    </p>
                    <h3>Memory Hierarchy</h3>
                    <p>
                      Global, local, and texture memory have the greatest access
                      latency, followed by constant memory, shared memory, and
                      the register file.
                    </p>
                    <p>
                      <img
                        src="img/blog/memory-hierarchy.jpg"
                        alt="Memory Hierarchy"
                      ></img>
                    </p>
                    <h4>Global Memory</h4>
                    <p>
                      Global memory is allocated and managed by the host. It can
                      also be accessed by both the host and the devices,
                      therefore, can be used to exchange information. It is the
                      main memory space, very large, accessible by all threads
                      in all blocks, but slower than shared memory. Memory
                      allocated on the host and passed as a parameter to a
                      kernel is, by default, allocated in global memory. Global
                      memory is not coherent, meaning that changes made by one
                      thread block may not be immediately visible to other
                      thread blocks. Small CUDA arrays, which size is known at
                      compile time, will also be allocated in registers by the
                      compiler.
                    </p>
                    <h4>Local Memory</h4>
                    <p>
                      Local memory is only visible to the thread allocating it.
                      It is also very slow, but accesses are coalesced. It is
                      often used to store spilled registers. If the size of a
                      CUDA array is not known at compile time, then it will be
                      stored in the local memory instead of the global memory.
                    </p>
                    <h4>Shared Memory</h4>
                    <p>
                      Shared memory is on-chip, making it much faster (100x
                      lower latency than global memory) than global and local
                      memory. All threads in the same block have access to the
                      same shared memory. By default, there is only 48 KB CUDA
                      static (size known at compile time) shared memory
                      available. To have more shared memory, one will have to
                      use dynamic shared memory and specify the size to be
                      allocated with cudaFuncSetAttribute. Check the return
                      value because it may fail. Shared memory is divided into
                      32 equally-sized memory banks that can be accessed
                      simultaneously. Therefore, any load or store that spans n
                      distinct memory banks can be serviced at the same time. A
                      bank conflict occurs when multiple addresses of a memory
                      request are mapped to the same bank, and different threads
                      in the same warp are accessing different addresses of the
                      same bank. The one exception here is when multiple threads
                      in a warp address the same shared memory location,
                      resulting in a broadcast and no bank conflict. Each bank
                      has a bandwidth of 32 bits every clock cycle, and
                      successive 32-bit words are assigned to successive banks.
                      Shared memory bank conflicts are only relevant for threads
                      within a warp, on a particular instruction/cycle. The warp
                      size is 32 threads, same as the number of banks. Design
                      code carefully to avoid bank conflicts.
                    </p>
                    <h4>Constant Memory</h4>
                    <p>
                      The constant memory (typically 64 KB) is used for
                      read-only data that does not change during kernel
                      execution. It is written by the host and can be read by
                      both the host and the devices. The constant memory is
                      cached. When a request comes in, it is broken into two
                      half-wraps. For all threads of a half warp, reading from
                      the constant cache, as long as all threads read the same
                      address, is no slower than reading from a register.
                      However, if threads of the half-warp access different
                      memory locations, the access time scales linearly with the
                      number of different addresses read by all threads within
                      the half-warp. Constant memory is cached, so consecutive
                      reads from the same address do not incur extra costs.
                    </p>
                    <h4>Texture Memory</h4>
                    <p>
                      Texture memory is cached and read-only. Use it when the
                      data is rarely updated, read often, and has good spacial
                      locality.
                    </p>
                    <h3>Best Practices</h3>
                    <ul>
                      <li>
                        Computations should be run on numerous data elements
                        simultaneously in parallel
                      </li>
                      <li>
                        Data transfers between host and devices should be
                        minimized. Data should be kept on devices for as long as
                        possible
                      </li>
                      <li>
                        There should be some coherence in memory access by
                        adjacent threads running on the device to allow the
                        hardware to coalesce groups of reads or writes of
                        multiple data items into one operation
                      </li>
                    </ul>
                    <h2>Other Resources</h2>
                    <a href="https://leimao.github.io/blog/CUDA-Shared-Memory-Bank/">
                      {" "}
                      Lei Mao: CUDA Shared Memory Bank{" "}
                    </a>
                    <p></p>
                  </div>
                  {/* End article content */}
                </article>
                {/* End Article */}
                {/* End .contact Form */}
              </div>
            </div>
          </div>
        </div>
        {/* End modal box news */}
      </Modal>
      {/* End  Modal for Blog-3 */}
    </>
  );
};

export default Blog;
