diff --git a/content/post/202005-cuda-beginner.md b/content/post/202005-cuda-beginner.md
new file mode 100644
index 0000000..ebf6717
--- /dev/null
+++ b/content/post/202005-cuda-beginner.md
@@ -0,0 +1,99 @@
++++
+title = "CUDA 编程入门：GPU 驱动、CUDA 与样例程序"
+date = "2020-05-31"
+description = "从一台旧笔电折腾 nVidia CUDA 编程之旅"
++++
+
+[TOC]
+
+## 缘由
+显卡这东西就目前来讲，通常两个作用，图形加速、机器学习，要么就数字货币挖矿，但这几年矿场倒的多。
+不论如何，GPU 拼的都是算力，比 CPU 高很多。作为将近 8 年的卡巴基佬（手动斜眼），日常除了看各路
+显卡性能 PK、购买攻略外，没做过其他研究，实在惭愧。
+
+前些时间，在工作中要处理两个 CSV 文件，里面都是数字，要找出第二个文件比第一个多的数。
+最先想到自然是用 Excel，VLOOKUP 函数能办到，但只能单核运行。两个文件较大时，就非常慢，
+比如几十万行时，经常 Excel 无响应几分钟然后卡死。
+
+举例来说，文件 a.csv
+
+```csv
+1
+2
+3
+```
+
+文件 b.csv
+```csv
+2
+1
+4
+3
+5
+```
+
+要做的便找出在 b.csv 但不在 a.csv 的数字，上例结果应为
+```csv
+4
+5
+```
+
+为了比 Excel 运算更快，我自己写了个程序。
+
+## 程序
+逻辑蛮简单的，取 b.csv 每一行与 a.csv 所有行比较，相等跳过，无的话便符合，输出。伪代码如下
+```php
+FOR n1 IN b.csv {
+    found = false
+    FOR n2 IN a.csv {
+        IF n1 == n2 {
+            found = true
+            BREAK
+        }
+    }
+    if !found {
+        print n1
+    }
+}
+```
+
+这样做的话时间复杂度为 O(n^2)，如果两个文件分别为 100k 行，则运算次数为 100 亿。
+
+由于对输出数字顺序并无要求，多核并行的话，能更好发挥 CPU 算力。把 b.csv 切分为 N 份，
+N 为核数（或称超线程数），每份分别与 a.csv 比对，最后把结果合并，这样核心数越多应该越快。
+
+拿了一些不同 CPU 的机器分别试了试，得到以下结果。
+
+| CPU | Cores, Threads | Base Freq | OS | Single core | Multicores |
+|--|:--:|--|--|--|--|
+| AMD R5 3600 | 6, 12 | 3.6GHz | Windows 10 | | 27.2s |
+| Intel i5-7500  | 4, 4 | 3.4GHz | macOS 10.14 | 3m01.35s | 48.856s |
+| Intel i5-7267U | 2, 4 | 3.1GHz | macOS 10.14 | - | 1m36.56s |
+| Intel i5-2520M | 2, 4 | 2.5GHz | Linux 5.5   | 4m41.82s | 2m16.95s |
+| Intel Atom N270 | 1, 2 | 1.6GHz | Linux 5.5 | - | ~15m |
+
+
+这个 csv 文件处理的程序，放在了 GitHub https://github.com/zyfdegh/csvdiff
+
+可以跑一下看看你的 CPU 耗时多久，欢迎在下面留言。
+
+## CUDA
+后来一直在想怎样提升速度，不改变算法复杂度的话，貌似只能加钱升级 AMD YES 了。这时想到手头一台老笔电
+装有 nVidia GT635M 弱鸡显卡，平常打 CSGO 都非常吃力，开着还白耗电，不如折腾下看看。
+
+以前听闻过各路 ML 大佬谈笑风生 tf, pytorch, dnn, cuda, nvidia-docker 之类的，作为菜鸟只能
+在一旁黯然，也未曾想过去利用 GPU 来编程做通用计算，上老黄官网查了几天下来，逐渐对 GPU 运算有了了解。
+
+## 相关链接
+
+1. GPU Accelerated Computing with C and C++
+https://developer.nvidia.com/how-to-cuda-c-cpp
+
+2. CUDA Quick Start Guide (Install, Run Sample)
+https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html
+
+3. An Even Easier Introduction to CUDA
+https://devblogs.nvidia.com/even-easier-introduction-cuda/
+
+4. CUDA Compatibility
+https://docs.nvidia.com/deploy/cuda-compatibility/index.html
\ No newline at end of file
diff --git a/content/post/202005-cuda-beginner/add-block.cu b/content/post/202005-cuda-beginner/add-block.cu
new file mode 100644
index 0000000..2679078
--- /dev/null
+++ b/content/post/202005-cuda-beginner/add-block.cu
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <math.h>
+// Kernel function to add the elements of two arrays
+__global__
+void add(int n, float *x, float *y)
+{
+  int index = threadIdx.x;
+  int stride = blockDim.x;
+  for (int i = index; i < n; i += stride)
+    y[i] = x[i] + y[i];
+}
+
+int main(void)
+{
+  int N = 1<<20;
+  float *x, *y;
+
+  // Allocate Unified Memory – accessible from CPU or GPU
+  cudaMallocManaged(&x, N*sizeof(float));
+  cudaMallocManaged(&y, N*sizeof(float));
+
+  // initialize x and y arrays on the host
+  for (int i = 0; i < N; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+  }
+
+  // Run kernel on 1M elements on the GPU
+  add<<<1, 256>>>(N, x, y);
+
+  // Wait for GPU to finish before accessing on host
+  cudaDeviceSynchronize();
+
+  // Check for errors (all values should be 3.0f)
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = fmax(maxError, fabs(y[i]-3.0f));
+  std::cout << "Max error: " << maxError << std::endl;
+
+  // Free memory
+  cudaFree(x);
+  cudaFree(y);
+  
+  return 0;
+}
\ No newline at end of file
diff --git a/content/post/202005-cuda-beginner/add.cpp b/content/post/202005-cuda-beginner/add.cpp
new file mode 100644
index 0000000..dd6e601
--- /dev/null
+++ b/content/post/202005-cuda-beginner/add.cpp
@@ -0,0 +1,38 @@
+#include <iostream>
+#include <math.h>
+
+// function to add the elements of two arrays
+void add(int n, float *x, float *y)
+{
+  for (int i = 0; i < n; i++)
+      y[i] = x[i] + y[i];
+}
+
+int main(void)
+{
+  int N = 1<<20; // 1M elements
+
+  float *x = new float[N];
+  float *y = new float[N];
+
+  // initialize x and y arrays on the host
+  for (int i = 0; i < N; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+  }
+
+  // Run kernel on 1M elements on the CPU
+  add(N, x, y);
+
+  // Check for errors (all values should be 3.0f)
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = fmax(maxError, fabs(y[i]-3.0f));
+  std::cout << "Max error: " << maxError << std::endl;
+
+  // Free memory
+  delete [] x;
+  delete [] y;
+
+  return 0;
+}
\ No newline at end of file
diff --git a/content/post/202005-cuda-beginner/add.cu b/content/post/202005-cuda-beginner/add.cu
new file mode 100644
index 0000000..1165c94
--- /dev/null
+++ b/content/post/202005-cuda-beginner/add.cu
@@ -0,0 +1,43 @@
+#include <iostream>
+#include <math.h>
+// Kernel function to add the elements of two arrays
+__global__
+void add(int n, float *x, float *y)
+{
+  for (int i = 0; i < n; i++)
+    y[i] = x[i] + y[i];
+}
+
+int main(void)
+{
+  int N = 1;
+  float *x, *y;
+
+  // Allocate Unified Memory – accessible from CPU or GPU
+  cudaMallocManaged(&x, N*sizeof(float));
+  cudaMallocManaged(&y, N*sizeof(float));
+
+  // initialize x and y arrays on the host
+  for (int i = 0; i < N; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+  }
+
+  // Run kernel on 1M elements on the GPU
+  add<<<1, 1>>>(N, x, y);
+
+  // Wait for GPU to finish before accessing on host
+  cudaDeviceSynchronize();
+
+  // Check for errors (all values should be 3.0f)
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = fmax(maxError, fabs(y[i]-3.0f));
+  std::cout << "Max error: " << maxError << std::endl;
+
+  // Free memory
+  cudaFree(x);
+  cudaFree(y);
+  
+  return 0;
+}
\ No newline at end of file