时间紧张,本文只记录操作步骤;实验报告可以交给 agent 根据运行结果自行生成。

  1. 登录集群并创建实验目录(端口号、用户名换成自己的)
ssh -p 端口号 用户名@211.81.52.30
mkdir lab4
cd lab4
  2. 生成测试数据(将下面从 cat 到 EOF 的内容整段粘贴到终端,会生成 gen_data.py,然后运行它)
cat > gen_data.py << 'EOF'
"""Generate a banded sparse test system (matrix.txt / vector.txt) for the CG solver.

Usage: python3 gen_data.py [n]

The matrix is stored in CSR form with the FULL band (4 entries left of the
diagonal, the diagonal, 4 entries right of it), so the stored matrix really is
symmetric.  With a diagonal of 20.0 and at most eight off-diagonal entries of
1.0 per row it is strictly diagonally dominant, hence positive definite.
The right-hand side is b = A * ones, i.e. the row sums of A.

File formats (whitespace separated, as parsed by cg_hip_large.cpp):
  matrix.txt: "n nnz", then one line each of row_ptr, col_idx and values
  vector.txt: "n", then one line with the n entries of b
"""
import sys

DEFAULT_N = 1000000   # default dimension (one million rows)
HALF_BANDWIDTH = 4    # non-zeros on each side of the diagonal
DIAG_VAL = 20.0       # diagonal entry, larger than the sum of the off-diagonals
OFF_VAL = 1.0         # every off-diagonal entry inside the band
WRITE_CHUNK = 100000  # numbers formatted per write() call


def build_banded_csr(n, half_bandwidth=HALF_BANDWIDTH, diag_val=DIAG_VAL, off_val=OFF_VAL):
    """Return (values, col_idx, row_ptr) of an n x n symmetric banded matrix in CSR.

    Row i holds the columns max(0, i - half_bandwidth) .. min(n - 1, i + half_bandwidth)
    in ascending order; the diagonal is diag_val and all other entries are off_val.
    """
    values = []
    col_idx = []
    row_ptr = [0]
    for i in range(n):
        first_col = max(0, i - half_bandwidth)
        last_col = min(n - 1, i + half_bandwidth)
        for j in range(first_col, last_col + 1):
            values.append(diag_val if j == i else off_val)
            col_idx.append(j)
        row_ptr.append(len(values))
    return values, col_idx, row_ptr


def row_sums(values, row_ptr):
    """Return b = A * ones, i.e. the sum of the stored values of every row."""
    return [sum(values[row_ptr[i]:row_ptr[i + 1]], 0.0)
            for i in range(len(row_ptr) - 1)]


def write_sequence(f, seq):
    """Write seq as one line of space-terminated numbers, formatting in chunks."""
    for start in range(0, len(seq), WRITE_CHUNK):
        f.write("".join(f"{item} " for item in seq[start:start + WRITE_CHUNK]))
    f.write("\n")


def main(argv):
    """Generate matrix.txt and vector.txt; argv[1], if given, overrides the dimension."""
    try:
        n = int(argv[1]) if len(argv) > 1 else DEFAULT_N
    except ValueError:
        raise SystemExit("Usage: python3 gen_data.py [n]")
    if n <= 0:
        raise SystemExit("n must be a positive integer")

    nnz_per_row = 2 * HALF_BANDWIDTH + 1

    print(f"正在生成 {n}x{n} 矩阵,每行 {nnz_per_row} 个非零元...")
    print("这可能需要一些时间,请耐心等待...")

    values, col_idx, row_ptr = build_banded_csr(n)

    nnz = len(values)
    print(f"实际非零元数量: {nnz}")

    # Sanity check: rows near the boundary have fewer than nnz_per_row entries.
    print("验证每行非零元数量(前10行):")
    for i in range(min(10, n)):
        start = row_ptr[i]
        end = row_ptr[i + 1]
        print(f" 第{i}行: {end - start} 个非零元")

    # With the default n the file is on the order of 100 MB.
    print("\n保存矩阵到 matrix.txt...")
    with open('matrix.txt', 'w') as f:
        f.write(f"{n} {nnz}\n")
        write_sequence(f, row_ptr)
        write_sequence(f, col_idx)
        write_sequence(f, values)

    print("矩阵保存完成!")

    # b = A * ones, so the exact solution of A x = b is the all-ones vector.
    print("生成右端向量...")
    b = row_sums(values, row_ptr)

    print("保存右端向量到 vector.txt...")
    with open('vector.txt', 'w') as f:
        f.write(f"{n}\n")
        write_sequence(f, b)

    print(f"\n数据生成完成!")
    print(f"矩阵维度: {n}x{n}")
    print(f"非零元数量: {nnz}")
    print(f"文件 matrix.txt 和 vector.txt 已生成")


if __name__ == "__main__":
    main(sys.argv)
EOF

python3 gen_data.py
  3. 创建 cg_hip_large.cpp(同样将下面从 cat 到 EOF 的内容整段粘贴到终端)
cat > cg_hip_large.cpp << 'EOF'
#include <iostream>
#include <fstream>
#include <vector>
#include <cmath>
#include <chrono>
#include <algorithm>
#include <hip/hip_runtime.h>

using namespace std;

// Threads per block used by main() when launching spmv_kernel.
// NOTE: run_10times.sh rewrites this value in place with
//   sed "s/const int BLOCK_SIZE = [0-9]*/const int BLOCK_SIZE = <N>/"
// so the text `const int BLOCK_SIZE = <digits>` must stay exactly in this form.
const int BLOCK_SIZE = 1024;

// GPU kernel: CSR sparse matrix-vector product y = A * x.
// The global thread index selects the row; threads whose index is >= n do nothing.
//   n        number of rows
//   values   non-zero values of A
//   row_ptr  n + 1 offsets; row r owns entries [row_ptr[r], row_ptr[r + 1])
//   col_idx  column index of each stored value
//   x        input vector
//   y        output vector, y[r] receives the dot product of row r with x
__global__ void spmv_kernel(int n, const double* values, const int* row_ptr,
                            const int* col_idx, const double* x, double* y) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= n) {
        return;
    }

    const int first = row_ptr[row];
    const int last = row_ptr[row + 1];

    double acc = 0.0;
    for (int k = first; k < last; ++k) {
        acc += values[k] * x[col_idx[k]];
    }
    y[row] = acc;
}

// Evaluate a HIP runtime call once; on any status other than hipSuccess print
// the source line and the HIP error string to stderr and terminate with exit(1).
// The do { ... } while (0) wrapper makes `CHECK_HIP(x);` behave as a single
// statement (safe inside an unbraced if/else), and the oddly named local
// avoids clashing with identifiers used in the caller's expression.
#define CHECK_HIP(cmd) do { \
    const hipError_t check_hip_status_ = (cmd); \
    if (check_hip_status_ != hipSuccess) { \
        fprintf(stderr, "HIP error at line %d: %s\n", __LINE__, hipGetErrorString(check_hip_status_)); \
        exit(1); \
    } \
} while (0)

/**
 * @brief Load a CSR matrix from a whitespace-separated text file.
 *
 * Expected layout: "n nnz", followed by n + 1 row pointers, nnz column
 * indices and nnz values.
 *
 * On any problem (file cannot be opened, missing/unparsable numbers, or an
 * inconsistent CSR structure) a message is printed to stderr and the process
 * terminates with exit(1).  The structure is validated here because the GPU
 * kernel indexes device memory with row_ptr/col_idx without bounds checks.
 *
 * @param fname   path of the matrix file
 * @param n       [out] matrix dimension
 * @param nnz     [out] number of stored entries
 * @param vals    [out] stored values (size nnz)
 * @param row_ptr [out] row offsets (size n + 1)
 * @param col_idx [out] column indices (size nnz)
 */
void read_matrix_large(const std::string& fname, int& n, int& nnz,
                       std::vector<double>& vals, std::vector<int>& row_ptr,
                       std::vector<int>& col_idx) {
    std::ifstream fin(fname);
    if (!fin.is_open()) {
        std::cerr << "Cannot open file: " << fname << std::endl;
        exit(1);
    }

    if (!(fin >> n >> nnz) || n < 0 || nnz < 0) {
        std::cerr << "Invalid matrix header in file: " << fname << std::endl;
        exit(1);
    }

    row_ptr.resize(static_cast<std::size_t>(n) + 1);
    for (int& offset : row_ptr) {
        fin >> offset;
    }

    col_idx.resize(nnz);
    for (int& col : col_idx) {
        fin >> col;
    }

    vals.resize(nnz);
    for (double& value : vals) {
        fin >> value;
    }

    // A failed extraction leaves the stream in a fail state, so one check
    // after all reads detects truncated or malformed input.
    if (!fin) {
        std::cerr << "Truncated or malformed matrix file: " << fname << std::endl;
        exit(1);
    }

    bool consistent = row_ptr.front() == 0 && row_ptr.back() == nnz;
    for (int i = 0; consistent && i < n; i++) {
        consistent = row_ptr[i] <= row_ptr[i + 1];
    }
    for (int i = 0; consistent && i < nnz; i++) {
        consistent = col_idx[i] >= 0 && col_idx[i] < n;
    }
    if (!consistent) {
        std::cerr << "Inconsistent CSR data in file: " << fname << std::endl;
        exit(1);
    }
}

/**
 * @brief Load a dense vector from a whitespace-separated text file.
 *
 * Expected layout: the length n followed by n values.  If the file cannot be
 * opened or does not contain a valid length and n parsable values, a message
 * is printed to stderr and the process terminates with exit(1).
 *
 * @param fname path of the vector file
 * @param b     [out] resized to n and filled with the values
 */
void read_vector_large(const std::string& fname, std::vector<double>& b) {
    std::ifstream fin(fname);
    if (!fin.is_open()) {
        std::cerr << "Cannot open file: " << fname << std::endl;
        exit(1);
    }

    int n = 0;
    if (!(fin >> n) || n < 0) {
        std::cerr << "Invalid vector header in file: " << fname << std::endl;
        exit(1);
    }

    b.resize(n);
    for (double& entry : b) {
        fin >> entry;
    }

    // One check after the loop is enough: a failed read keeps the stream failed.
    if (!fin) {
        std::cerr << "Truncated or malformed vector file: " << fname << std::endl;
        exit(1);
    }
}

// Inner product of the first n entries of a and b, accumulated left to right
// in double precision on the host.
double dot(const vector<double>& a, const vector<double>& b, int n) {
    const double* lhs = a.data();
    const double* rhs = b.data();

    double acc = 0.0;
    int k = 0;
    while (k < n) {
        acc += lhs[k] * rhs[k];
        ++k;
    }
    return acc;
}

/**
 * Conjugate-gradient driver: solves A x = b with x0 = 0, running the sparse
 * matrix-vector product on the GPU and every vector operation on the host.
 *
 * Usage: <program> matrix.txt vector.txt max_iter
 *
 * Prints "<solve seconds> <average ms per iteration> <iterations performed>"
 * to stdout.  The timed region is the iteration loop only, which includes the
 * per-iteration host<->device copies of p and Ap.  Iteration stops when the
 * residual 2-norm drops below 1e-6, when |p.Ap| < 1e-15, or after max_iter
 * iterations.  Returns 1 on bad arguments or mismatched input sizes.
 */
int main(int argc, char* argv[]) {
    if (argc != 4) {
        cerr << "Usage: " << argv[0] << " matrix.txt vector.txt max_iter" << endl;
        return 1;
    }

    string mat_file = argv[1];
    string vec_file = argv[2];
    int max_iter = atoi(argv[3]);
    if (max_iter <= 0) {
        // atoi yields 0 for non-numeric text, so this also rejects garbage.
        cerr << "max_iter must be a positive integer, got: " << argv[3] << endl;
        return 1;
    }

    int n, nnz;
    vector<double> values;
    vector<int> row_ptr, col_idx;
    vector<double> b;

    read_matrix_large(mat_file, n, nnz, values, row_ptr, col_idx);
    read_vector_large(vec_file, b);

    // The loops below index b, r, p and Ap with 0..n-1, so the sizes must agree.
    if (n < 0 || b.size() != static_cast<size_t>(n)) {
        cerr << "Dimension mismatch: matrix dimension is " << n
             << " but vector has " << b.size() << " entries" << endl;
        return 1;
    }

    // x0 = 0, hence r0 = b and the first search direction is p0 = r0.
    vector<double> x(n, 0.0), r(b), p(b), Ap(n);

    double rho_old = dot(r, r, n);

    // Right-hand side is already (numerically) zero: nothing to solve, no output.
    if (sqrt(rho_old) < 1e-8) {
        return 0;
    }

    double *d_vals, *d_p, *d_Ap;
    int *d_row_ptr, *d_col_idx;

    CHECK_HIP(hipMalloc(&d_vals, nnz * sizeof(double)));
    CHECK_HIP(hipMalloc(&d_row_ptr, (n + 1) * sizeof(int)));
    CHECK_HIP(hipMalloc(&d_col_idx, nnz * sizeof(int)));
    CHECK_HIP(hipMalloc(&d_p, n * sizeof(double)));
    CHECK_HIP(hipMalloc(&d_Ap, n * sizeof(double)));

    // The matrix is uploaded once; only p and Ap move every iteration.
    CHECK_HIP(hipMemcpy(d_vals, values.data(), nnz * sizeof(double), hipMemcpyHostToDevice));
    CHECK_HIP(hipMemcpy(d_row_ptr, row_ptr.data(), (n + 1) * sizeof(int), hipMemcpyHostToDevice));
    CHECK_HIP(hipMemcpy(d_col_idx, col_idx.data(), nnz * sizeof(int), hipMemcpyHostToDevice));

    int block_size = BLOCK_SIZE;
    int grid_size = (n + block_size - 1) / block_size;  // ceil(n / block_size)

    auto solve_start = chrono::high_resolution_clock::now();

    // Number of iterations that actually executed an SpMV.  Tracking it
    // separately from the loop index keeps the count correct both when the
    // loop breaks early and when it runs to max_iter.
    int iterations = 0;
    for (int iter = 0; iter < max_iter; iter++) {
        iterations = iter + 1;

        CHECK_HIP(hipMemcpy(d_p, p.data(), n * sizeof(double), hipMemcpyHostToDevice));

        hipLaunchKernelGGL(spmv_kernel, grid_size, block_size, 0, 0,
                           n, d_vals, d_row_ptr, d_col_idx, d_p, d_Ap);
        // Catch launch failures (e.g. an unsupported block size) right away.
        CHECK_HIP(hipGetLastError());
        CHECK_HIP(hipDeviceSynchronize());

        CHECK_HIP(hipMemcpy(Ap.data(), d_Ap, n * sizeof(double), hipMemcpyDeviceToHost));

        double pAp = dot(p, Ap, n);
        if (fabs(pAp) < 1e-15) break;  // avoid dividing by (almost) zero

        double alpha = rho_old / pAp;

        for (int i = 0; i < n; i++) {
            x[i] += alpha * p[i];
            r[i] -= alpha * Ap[i];
        }

        double rho_new = dot(r, r, n);

        if (sqrt(rho_new) < 1e-6) break;  // converged

        double beta = rho_new / rho_old;
        for (int i = 0; i < n; i++) {
            p[i] = r[i] + beta * p[i];
        }

        rho_old = rho_new;
    }

    auto solve_end = chrono::high_resolution_clock::now();
    double solve_time = chrono::duration<double>(solve_end - solve_start).count();
    double avg_time_per_iter = (solve_time / iterations) * 1000;  // milliseconds

    cout << solve_time << " " << avg_time_per_iter << " " << iterations << "\n";

    CHECK_HIP(hipFree(d_vals));
    CHECK_HIP(hipFree(d_row_ptr));
    CHECK_HIP(hipFree(d_col_idx));
    CHECK_HIP(hipFree(d_p));
    CHECK_HIP(hipFree(d_Ap));

    return 0;
}
EOF
  4. 生成连续运行 10 次并取平均值的测试脚本 run_10times.sh(脚本的命令行参数是 BLOCK_SIZE),同样将下面从 cat 到 EOF 的内容整段粘贴到终端
cat > run_10times.sh << 'EOF'
#!/bin/bash
# Benchmark driver: patches BLOCK_SIZE in cg_hip_large.cpp, rebuilds with hipcc,
# runs the solver 10 times on matrix.txt / vector.txt (500 iterations max) and
# prints the average total time and average time per iteration.
#
# Usage: ./run_10times.sh <block_size>

if [ -z "$1" ]; then
    echo "Usage: ./run_10times.sh <block_size>"
    echo "Example: ./run_10times.sh 256"
    exit 1
fi

BLOCK_SIZE=$1

# The value is spliced into a sed expression and into C++ source, so accept digits only.
case "$BLOCK_SIZE" in
    *[!0-9]*)
        echo "Error: block_size must be a positive integer, got '$BLOCK_SIZE'" >&2
        exit 1
        ;;
esac

echo "=========================================="
echo "测试 block_size = $BLOCK_SIZE"
echo "共运行10次取平均值"
echo "=========================================="

sed -i "s/const int BLOCK_SIZE = [0-9]*/const int BLOCK_SIZE = $BLOCK_SIZE/" cg_hip_large.cpp

# Compiler output stays hidden on success, but a failed build must stop the
# script; otherwise a stale binary from a previous block size would be timed.
if ! build_log=$(hipcc -o cg_hip_large cg_hip_large.cpp -O3 2>&1); then
    echo "Error: hipcc failed to build cg_hip_large.cpp:" >&2
    echo "$build_log" >&2
    exit 1
fi

times=""
avg_times=""
ok_runs=0

for i in $(seq 1 10); do
    echo -n " 第 $i 次: "
    # The solver prints "<seconds> <ms per iteration> <iterations>"; keep only that line.
    output=$(./cg_hip_large matrix.txt vector.txt 500 2>&1 | grep -E "^[0-9]")

    if [ -z "$output" ]; then
        # Skip failed runs so they do not distort the averages.
        echo "failed (no timing output)"
        continue
    fi

    solve_time=$(echo "$output" | awk '{print $1}')
    avg_time=$(echo "$output" | awk '{print $2}')

    times="$times $solve_time"
    avg_times="$avg_times $avg_time"
    ok_runs=$((ok_runs + 1))

    echo "总时间=${solve_time}s, 每迭代=${avg_time}ms"
done

if [ "$ok_runs" -eq 0 ]; then
    echo "Error: no run produced timing output" >&2
    exit 1
fi

echo ""
echo "========== 结果汇总 (block_size=$BLOCK_SIZE) =========="

# Average total solve time over the successful runs.
echo "$times" | awk '{
    sum = 0;
    for (i = 1; i <= NF; i++) { sum += $i; }
    printf("平均总时间: %.6f s\n", sum / NF);
}'

# Average time per iteration over the successful runs.
echo "$avg_times" | awk '{
    sum = 0;
    for (i = 1; i <= NF; i++) { sum += $i; }
    printf("平均每迭代: %.6f ms\n", sum / NF);
}'

echo "=========================================="
EOF

chmod +x run_10times.sh
./run_10times.sh 1024

将上面命令中的 1024 依次改为 64、128、256、512,分别重新测试(例如 ./run_10times.sh 256)。

  1. 可以修改 gen_data.py 中的 n 来更改矩阵规模,重新生成 matrix.txt 和 vector.txt 后进行对比实验
  2. 实验报告一如既往给出平均值和加速比