From 151ffe8dd897996e3f82796572be7c2172485f2e Mon Sep 17 00:00:00 2001
From: Sam Hadow
Date: Fri, 18 Apr 2025 09:46:23 +0200
Subject: [PATCH] pagerank implementation

---
 .gitignore             |   3 +
 Makefile               |  52 +++++++++++++++++
 src/main.c             |  42 +++++++++++++
 src/matrix_operation.c |  88 ++++++++++++++++++++++++++++
 src/matrix_operation.h |   9 +++
 src/matrix_print.c     |  27 +++++++++
 src/matrix_print.h     |   8 +++
 src/power_algorithm.c  | 118 +++++++++++++++++++++++++++++++++++++
 src/power_algorithm.h  |   8 +++
 src/read_from_mtx.c    |  99 +++++++++++++++++++++++++++++++
 src/read_from_mtx.h    |   7 +++
 src/read_from_rb.c     | 130 +++++++++++++++++++++++++++++++++++++++++
 src/read_from_rb.h     |   8 +++
 src/sparse_matrix.c    |  11 ++++
 src/sparse_matrix.h    |  23 ++++++++
 src/time_helper.c      |  15 +++++
 src/time_helper.h      |   8 +++
 src/vector.c           |  58 ++++++++++++++++++
 src/vector.h           |  11 ++++
 19 files changed, 725 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 src/main.c
 create mode 100644 src/matrix_operation.c
 create mode 100644 src/matrix_operation.h
 create mode 100644 src/matrix_print.c
 create mode 100644 src/matrix_print.h
 create mode 100644 src/power_algorithm.c
 create mode 100644 src/power_algorithm.h
 create mode 100644 src/read_from_mtx.c
 create mode 100644 src/read_from_mtx.h
 create mode 100644 src/read_from_rb.c
 create mode 100644 src/read_from_rb.h
 create mode 100644 src/sparse_matrix.c
 create mode 100644 src/sparse_matrix.h
 create mode 100644 src/time_helper.c
 create mode 100644 src/time_helper.h
 create mode 100644 src/vector.c
 create mode 100644 src/vector.h

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..031f49a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pdf
+out/
+data/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e275ddf
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,52 @@
+# --------------------------------------------------
+# Configuration
+# --------------------------------------------------
+CC := gcc
+CFLAGS := -Wall -fopenmp -O3
+LDLIBS := -lm
+SRCDIR := src
+OBJDIR := out
+DATAPATH := data/web-Google/web-Google.mtx
+
+# Sources and the object files built from them
+SRCS := $(wildcard $(SRCDIR)/*.c)
+OBJS := $(patsubst $(SRCDIR)/%.c,$(OBJDIR)/%.o,$(SRCS))
+
+# --------------------------------------------------
+# Phony targets
+# --------------------------------------------------
+.PHONY: all sparse clean
+
+all: sparse
+
+sparse: $(OBJDIR)/sparse | $(OBJDIR)
+	@echo "→ Running sparse"
+	./$(OBJDIR)/sparse
+
+# --------------------------------------------------
+# Link
+# --------------------------------------------------
+$(OBJDIR)/sparse: $(OBJS) $(DATAPATH) | $(OBJDIR)
+	@echo "→ Copying input data"
+	cp $(DATAPATH) $(OBJDIR)/input.rb
+	@echo "→ Linking $@"
+	$(CC) $(CFLAGS) -o $@ $(OBJS) $(LDLIBS)
+
+# --------------------------------------------------
+# Compile
+# --------------------------------------------------
+$(OBJDIR)/%.o: $(SRCDIR)/%.c | $(OBJDIR)
+	@echo "→ Compiling $<"
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# --------------------------------------------------
+# Check if output directory exists
+# --------------------------------------------------
+$(OBJDIR):
+	mkdir -p $(OBJDIR)
+
+# --------------------------------------------------
+# Clean
+# --------------------------------------------------
+clean:
+	rm -rf $(OBJDIR)/*
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..31a8fb2
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include "matrix_operation.h"
+#include "sparse_matrix.h"
+#include "read_from_rb.h"
+#include "power_algorithm.h"
+#include "time_helper.h"
+
+/*
+ * Load the Matrix Market file at `path`, run PageRank on it and print the
+ * elapsed time after loading and after the computation.
+ */
+void test_pagerank(const char *path) {
+    struct timeval tvstart, tv;
+    gettimeofday(&tvstart, NULL);
+
+    SparseMatrix *matrix = read_sparse_matrix_from_mtx(path);
+    convert_to_stochastic(matrix);
+    // Time 2
+    gettimeofday(&tv, NULL);
+    print_time_diff("read matrix", &tvstart, &tv);
+
+    // pagerank() allocates the vector it returns; it is freed below.
+    double *result = pagerank(matrix, 1e-10, 0.35);
+    if (result == NULL) {
+        fprintf(stderr, "Memory allocation failed\n");
+        exit(EXIT_FAILURE);
+    }
+    // Time 3
+    gettimeofday(&tv, NULL);
+    print_time_diff("finish", &tvstart, &tv);
+
+    free(result);
+    free_sparse_matrix(matrix);
+}
+
+int main(void) {
+    test_pagerank("./out/input.rb");
+
+    return 0;
+}
diff --git a/src/matrix_operation.c b/src/matrix_operation.c
new file mode 100644
index 0000000..5193b87
--- /dev/null
+++ b/src/matrix_operation.c
@@ -0,0 +1,88 @@
+#include "matrix_operation.h"
+#include "vector.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include "sparse_matrix.h"
+
+/*
+ * Sequential product result = vector * matrix:
+ * result[dest] accumulates vector[origin] * value over every arc.
+ * `result` must hold num_nodes entries and is overwritten.
+ */
+void multiply_vector_matrix(const double *vector, const SparseMatrix *matrix, double *result) {
+    init_vector(result, matrix->num_nodes, 0.0);
+    for (int i = 0; i < matrix->num_arcs; ++i) {
+        int origin = matrix->arcs[i].origin;
+        int dest = matrix->arcs[i].dest;
+        result[dest] += vector[origin] * matrix->arcs[i].value;
+    }
+}
+
+/*
+ * OpenMP version of multiply_vector_matrix(). Every thread accumulates its
+ * share of the arcs into a private buffer, then adds that buffer into
+ * `result` with atomic updates. Aborts the program if a private buffer
+ * cannot be allocated.
+ */
+void multiply_vector_matrix_parallel(const double *vector, const SparseMatrix *matrix, double *result) {
+    int num_nodes = matrix->num_nodes;
+    int num_arcs = matrix->num_arcs;
+    init_vector(result, num_nodes, 0.0);
+
+    #pragma omp parallel
+    {
+        double *local_result = calloc(num_nodes, sizeof *local_result);
+        if (local_result == NULL) {
+            fprintf(stderr, "Memory allocation failed\n");
+            exit(EXIT_FAILURE);
+        }
+
+        // parallelize for loop
+        #pragma omp for
+        for (int i = 0; i < num_arcs; ++i) {
+            int origin = matrix->arcs[i].origin;
+            int dest = matrix->arcs[i].dest;
+            local_result[dest] += vector[origin] * matrix->arcs[i].value;
+        }
+
+        // merge results
+        for (int j = 0; j < num_nodes; ++j) {
+            double val = local_result[j];
+            if (val != 0.0) {
+                #pragma omp atomic
+                result[j] += val;
+            }
+        }
+
+        free(local_result);
+    }
+}
+
+/*
+ * Divide the value of every arc by the sum of the values of all arcs that
+ * share its `dest`, so that those values add up to 1. Arcs whose group sums
+ * to zero are left untouched. Prints an error and leaves the matrix
+ * unchanged if the temporary buffer cannot be allocated.
+ */
+void convert_to_stochastic(SparseMatrix *matrix) {
+    double *dest_sum = calloc(matrix->num_nodes, sizeof *dest_sum);
+    if (!dest_sum) {
+        fprintf(stderr, "Memory allocation failed\n");
+        return;
+    }
+
+    // sum the values of the arcs reaching each dest
+    for (int i = 0; i < matrix->num_arcs; i++) {
+        dest_sum[matrix->arcs[i].dest] += matrix->arcs[i].value;
+    }
+
+    // normalize values
+    for (int i = 0; i < matrix->num_arcs; i++) {
+        double total = dest_sum[matrix->arcs[i].dest];
+        if (total != 0.0) {
+            matrix->arcs[i].value /= total;
+        }
+    }
+
+    free(dest_sum);
+}
diff --git a/src/matrix_operation.h b/src/matrix_operation.h
new file mode 100644
index 0000000..b51bd4a
--- /dev/null
+++ b/src/matrix_operation.h
@@ -0,0 +1,9 @@
+#ifndef MATRIX_OPERATION_H
+#define MATRIX_OPERATION_H
+#include "sparse_matrix.h"
+
+void multiply_vector_matrix(const double *vector, const SparseMatrix *matrix, double *result);
+void multiply_vector_matrix_parallel(const double *vector, const SparseMatrix *matrix, double *result);
+void convert_to_stochastic(SparseMatrix *matrix);
+
+#endif
diff --git a/src/matrix_print.c b/src/matrix_print.c
new file mode 100644
index 0000000..c70e24a
--- /dev/null
+++ b/src/matrix_print.c
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include "matrix_print.h"
+#include "sparse_matrix.h"
+
+/* Print the `size` entries of `vector` on one line, separated by spaces. */
+void print_vector(double *vector, int size) {
+    for (int i = 0; i < size; ++i) {
+        printf("%lf ", vector[i]);
+    }
+    printf("\n");
+}
+
+/*
+ * Print `matrix` as a size line followed by one "origin dest value" line per
+ * arc, with node indices shifted to start at 1. Does nothing for NULL.
+ */
+void print_sparse_matrix(const SparseMatrix *matrix) {
+    if (matrix) {
+        printf("%d %d %d\n", matrix->num_nodes, matrix->num_nodes, matrix->num_arcs);
+        for (int i = 0; i < matrix->num_arcs; ++i) {
+            printf("%d ", matrix->arcs[i].origin + 1);
+            printf("%d ", matrix->arcs[i].dest + 1);
+            printf("%.10f ", matrix->arcs[i].value);
+            printf("\n");
+        }
+    }
+}
diff --git a/src/matrix_print.h b/src/matrix_print.h
new file mode 100644
index 0000000..45a410f
--- /dev/null
+++ b/src/matrix_print.h
@@ -0,0 +1,8 @@
+#ifndef MATRIX_PRINT_H
+#define MATRIX_PRINT_H
+#include "sparse_matrix.h"
+
+void print_vector(double *vector, int size);
+void print_sparse_matrix(const SparseMatrix *matrix);
+
+#endif
diff --git a/src/power_algorithm.c b/src/power_algorithm.c
new file mode 100644
index 0000000..46d6fce
--- /dev/null
+++ b/src/power_algorithm.c
@@ -0,0 +1,118 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "power_algorithm.h"
+#include "sparse_matrix.h"
+#include "matrix_operation.h"
+#include "vector.h"
+
+/*
+ * Compute pi * matrix into a newly allocated vector of num_nodes entries.
+ * The caller frees the result. Returns NULL if the allocation fails.
+ */
+double* power_algorithm_step(const SparseMatrix *matrix, const double *pi) {
+    double* result = malloc(matrix->num_nodes * sizeof *result);
+    if (result == NULL) {
+        return NULL;
+    }
+    multiply_vector_matrix_parallel(pi, matrix, result);
+    return result;
+}
+
+/*
+ * Repeat pi <- pi * matrix, starting from the uniform vector, until the sum
+ * of absolute differences between two successive iterates is at most
+ * epsilon. Returns the last iterate in a newly allocated vector that the
+ * caller frees, or NULL if an allocation fails.
+ */
+double* power_algorithm(const SparseMatrix *matrix, double epsilon) {
+    int N = matrix->num_nodes;
+    double* pi = malloc(N * sizeof *pi);
+    if (pi == NULL) {
+        return NULL;
+    }
+    init_vector(pi, N, 1.0/(double)N);
+
+    double* pi2 = power_algorithm_step(matrix, pi);
+    while (pi2 != NULL && diff_norm_vector(pi, pi2, N) > epsilon) {
+        printf("step\n");
+        // The newest iterate becomes the input of the next step.
+        free(pi);
+        pi = pi2;
+        pi2 = power_algorithm_step(matrix, pi);
+    }
+
+    free(pi);
+    return pi2;
+}
+
+/*
+ * PageRank by power iteration. Starting from the uniform vector, each step
+ * computes
+ *     pi_new = alpha * (pi * M) + ((1 - alpha) / N + alpha / N * (pi . f)) * e
+ * where f is the vector filled in by generate_f(), rescales pi_new so that
+ * its entries sum to 1, and stops once the sum of absolute differences
+ * between pi and pi_new is at most epsilon.
+ * Returns a newly allocated vector of num_nodes entries that the caller
+ * frees, or NULL if an allocation fails.
+ */
+double* pagerank(const SparseMatrix *matrix, double epsilon, double alpha) {
+    int N = matrix->num_nodes;
+    size_t vec_size = N * sizeof(double);
+
+    double* pi = malloc(vec_size);
+    double* pi_new = malloc(vec_size);
+    double* f = malloc(vec_size);
+    if (pi == NULL || pi_new == NULL || f == NULL) {
+        free(pi);
+        free(pi_new);
+        free(f);
+        return NULL;
+    }
+    double right_const = (1.0 - alpha) / N;
+
+    init_vector(pi, N, 1.0 / N);
+
+    generate_f(matrix, f);
+
+    double diff;
+    int iter = 0;
+
+    do {
+        // 1. pi * M
+        double* temp = power_algorithm_step(matrix, pi);
+        if (temp == NULL) {
+            free(pi);
+            free(pi_new);
+            free(f);
+            return NULL;
+        }
+
+        // 2. alpha/N * (pi * f)
+        double right_var = (alpha/(double)N) * vec_product(pi, f, N);
+
+        // 3. alpha*(pi*M) + (right_const+alpha/N * (pi * f))*e
+        for (int i = 0; i < N; i++) {
+            pi_new[i] = alpha * temp[i] + right_const + right_var;
+        }
+        free(temp);
+
+        // 4. Normalize
+        normalize_vector(pi_new, N);
+
+        // 5. Calculate convergence
+        diff = diff_norm_vector(pi, pi_new, N);
+
+        // 6. Swap the buffers: pi_new becomes the current iterate and the
+        //    old buffer is reused for the next step.
+        double* previous = pi;
+        pi = pi_new;
+        pi_new = previous;
+
+        printf("Iteration %d: diff = %.16f\n", ++iter, diff);
+    } while (diff > epsilon);
+
+    free(pi_new);
+    free(f);
+    return pi;
+}
diff --git a/src/power_algorithm.h b/src/power_algorithm.h
new file mode 100644
index 0000000..a2ef5f0
--- /dev/null
+++ b/src/power_algorithm.h
@@ -0,0 +1,8 @@
+#ifndef POWER_ALGORITHM_H
+#define POWER_ALGORITHM_H
+#include "sparse_matrix.h"
+
+double* power_algorithm(const SparseMatrix *matrix, double epsilon);
+double* pagerank(const SparseMatrix *matrix, double epsilon, double alpha);
+
+#endif
diff --git a/src/read_from_mtx.c b/src/read_from_mtx.c
new file mode 100644
index 0000000..b56dad3
--- /dev/null
+++ b/src/read_from_mtx.c
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "read_from_mtx.h"
+#include "sparse_matrix.h"
+
+/*
+ * Skip comment ('%') and empty lines, then parse the size line into
+ * matrix->num_nodes and matrix->num_arcs. A Matrix Market coordinate size
+ * line is "rows cols entries"; a two-field "nodes arcs" line is accepted
+ * as well. Returns 0 on success and 1 if no valid size line was found.
+ */
+int read_dims_ignore_comment(SparseMatrix *matrix, FILE *file) {
+    char buffer[1024];
+    while (fgets(buffer, sizeof(buffer), file) != NULL) {
+        if (*buffer == '%' || *buffer == '\n' || *buffer == '\0') {
+            continue;
+        }
+
+        int first = 0, second = 0, third = 0;
+        int fields = sscanf(buffer, "%d %d %d", &first, &second, &third);
+        if (fields == 3) {
+            // Use the larger dimension so every index fits in num_nodes.
+            matrix->num_nodes = first > second ? first : second;
+            matrix->num_arcs = third;
+        } else if (fields == 2) {
+            matrix->num_nodes = first;
+            matrix->num_arcs = second;
+        } else {
+            return 1;
+        }
+        return (matrix->num_nodes <= 0 || matrix->num_arcs < 0);
+    }
+    return 1;
+}
+
+/*
+ * Read matrix->num_arcs "origin dest [value]" lines into matrix->arcs. Node
+ * indices are 1-based in the file and stored 0-based; a missing value
+ * defaults to 1.0. Exits with status 5 on a short, malformed or
+ * out-of-range line.
+ */
+void parse_arcs(SparseMatrix *matrix, FILE *file) {
+    for (int i = 0; i < matrix->num_arcs; ++i) {
+        char line[256];
+        if (fgets(line, sizeof(line), file) == NULL) {
+            fprintf(stderr, "Failed to read arc %d\n", i);
+            exit(5);
+        }
+        int origin, dest;
+        double value = 1.0;
+        if (sscanf(line, "%d %d %lf", &origin, &dest, &value) < 2) {
+            fprintf(stderr, "Failed to read arc %d\n", i);
+            exit(5);
+        }
+        if (origin < 1 || origin > matrix->num_nodes ||
+            dest < 1 || dest > matrix->num_nodes) {
+            fprintf(stderr, "Arc %d references a node outside the matrix\n", i);
+            exit(5);
+        }
+        matrix->arcs[i].origin = origin - 1;
+        matrix->arcs[i].dest = dest - 1;
+        matrix->arcs[i].value = value;
+    }
+}
+
+/*
+ * Load a sparse matrix from the Matrix Market coordinate file `filename`.
+ * The caller releases the result with free_sparse_matrix(). Any failure
+ * prints a message and exits with a non-zero status.
+ */
+SparseMatrix* read_sparse_matrix_from_mtx(const char *filename) {
+    FILE *file = fopen(filename, "r");
+    if (!file) {
+        fprintf(stderr, "Failed to read file\n");
+        exit(1);
+    }
+
+    SparseMatrix *matrix = malloc(sizeof *matrix);
+    if (!matrix) {
+        fprintf(stderr, "Failed to allocate memory for matrix\n");
+        exit(3);
+    }
+
+    if (read_dims_ignore_comment(matrix, file)) {
+        fprintf(stderr, "Failed to read matrix dimensions\n");
+        exit(2);
+    }
+
+    matrix->arcs = malloc(matrix->num_arcs * sizeof *matrix->arcs);
+    if (!matrix->arcs) {
+        fprintf(stderr, "Failed to allocate memory for arcs\n");
+        exit(4);
+    }
+
+    parse_arcs(matrix, file);
+
+    fclose(file);
+    return matrix;
+}
diff --git a/src/read_from_mtx.h b/src/read_from_mtx.h
new file mode 100644
index 0000000..7644510
--- /dev/null
+++ b/src/read_from_mtx.h
@@ -0,0 +1,7 @@
+#ifndef MATRIX_READ_MTX_H
+#define MATRIX_READ_MTX_H
+#include "sparse_matrix.h"
+
+SparseMatrix* read_sparse_matrix_from_mtx(const char *filename);
+
+#endif
diff --git a/src/read_from_rb.c b/src/read_from_rb.c
new file mode 100644
index 0000000..a7d6924
--- /dev/null
+++ b/src/read_from_rb.c
@@ -0,0 +1,130 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "read_from_rb.h"
+#include "sparse_matrix.h"
+
+/* Read the next line of `file` into `buffer`, or exit with `status`. */
+static void read_line_or_exit(char *buffer, int size, FILE *file,
+                              const char *what, int status) {
+    if (fgets(buffer, size, file) == NULL) {
+        fprintf(stderr, "Unexpected end of file while reading %s\n", what);
+        exit(status);
+    }
+}
+
+/* Allocate `count` elements of `size` bytes, or exit with status 4. */
+static void *alloc_or_exit(int count, size_t size, const char *what) {
+    void *block = malloc((count > 0 ? (size_t)count : 1) * size);
+    if (block == NULL) {
+        fprintf(stderr, "Failed to allocate memory for %s\n", what);
+        exit(4);
+    }
+    return block;
+}
+
+/*
+ * Load a sparse matrix from the compressed-column text file `filename`
+ * (`rb`, presumably Rutherford-Boeing): a four-line header followed by the
+ * column pointers, the row indices and the values. Indices are 1-based in
+ * the file and stored 0-based, with the column as `origin` and the row as
+ * `dest`. The caller releases the result with free_sparse_matrix(). Any
+ * failure prints a message and exits with a non-zero status.
+ */
+SparseMatrix* read_sparse_matrix_from_rb(const char *filename) {
+    FILE *file = fopen(filename, "r");
+    if (!file) {
+        fprintf(stderr, "Failed to open file %s\n", filename);
+        exit(1);
+    }
+
+    SparseMatrix *matrix = malloc(sizeof *matrix);
+    if (!matrix) {
+        fprintf(stderr, "Failed to allocate memory for matrix\n");
+        exit(2);
+    }
+
+    // Header lines 1 and 2 are skipped; line 3 carries the dimensions.
+    char buffer[256];
+    read_line_or_exit(buffer, sizeof(buffer), file, "header", 3);
+    read_line_or_exit(buffer, sizeof(buffer), file, "header", 3);
+    read_line_or_exit(buffer, sizeof(buffer), file, "header", 3);
+
+    char type[4];
+    int rows, cols, nonzero;
+    if (sscanf(buffer, "%3s %d %d %d", type, &rows, &cols, &nonzero) != 4
+        || rows < 0 || cols < 0 || nonzero < 0) {
+        fprintf(stderr, "Failed to read matrix metadata\n");
+        exit(3);
+    }
+
+    matrix->num_nodes = rows;
+    matrix->num_arcs = nonzero;
+    matrix->arcs = alloc_or_exit(nonzero, sizeof(Arc), "arcs");
+
+    // Header line 4 is skipped.
+    read_line_or_exit(buffer, sizeof(buffer), file, "header", 3);
+
+    // Each section may span several lines; stop parsing as soon as the
+    // expected number of entries is reached so no array is overrun.
+    int *col_ptr = alloc_or_exit(cols + 1, sizeof(int), "column pointers");
+    int ptr_index = 0;
+    while (ptr_index < cols + 1) {
+        read_line_or_exit(buffer, sizeof(buffer), file, "column pointers", 5);
+        char *ptr = buffer;
+        int num_read;
+        while (ptr_index < cols + 1 &&
+               sscanf(ptr, "%d%n", &col_ptr[ptr_index], &num_read) == 1) {
+            ptr += num_read;
+            ptr_index++;
+        }
+    }
+
+    int *row_ind = alloc_or_exit(nonzero, sizeof(int), "row indices");
+    int row_index = 0;
+    while (row_index < nonzero) {
+        read_line_or_exit(buffer, sizeof(buffer), file, "row indices", 6);
+        char *ptr = buffer;
+        int num_read;
+        while (row_index < nonzero &&
+               sscanf(ptr, "%d%n", &row_ind[row_index], &num_read) == 1) {
+            ptr += num_read;
+            row_index++;
+        }
+    }
+
+    double *values = alloc_or_exit(nonzero, sizeof(double), "values");
+    int val_index = 0;
+    while (val_index < nonzero) {
+        read_line_or_exit(buffer, sizeof(buffer), file, "values", 7);
+        char *ptr = buffer;
+        int num_read;
+        while (val_index < nonzero &&
+               sscanf(ptr, "%lf%n", &values[val_index], &num_read) == 1) {
+            ptr += num_read;
+            val_index++;
+        }
+    }
+
+    int arc_idx = 0;
+    for (int j = 0; j < cols; j++) {
+        int start = col_ptr[j] - 1;
+        int end = col_ptr[j + 1] - 1;
+        // Column ranges must be contiguous and stay inside the arrays.
+        if (start != arc_idx || end < start || end > nonzero) {
+            fprintf(stderr, "Invalid column pointer for column %d\n", j + 1);
+            exit(8);
+        }
+        for (int k = start; k < end; k++) {
+            matrix->arcs[arc_idx].origin = j;
+            matrix->arcs[arc_idx].dest = row_ind[k] - 1;
+            matrix->arcs[arc_idx].value = values[k];
+            arc_idx++;
+        }
+    }
+
+    free(col_ptr);
+    free(row_ind);
+    free(values);
+    fclose(file);
+    return matrix;
+}
diff --git a/src/read_from_rb.h b/src/read_from_rb.h
new file mode 100644
index 0000000..c409329
--- /dev/null
+++ b/src/read_from_rb.h
@@ -0,0 +1,8 @@
+#ifndef RB_READER_H
+#define RB_READER_H
+
+#include "sparse_matrix.h"
+
+SparseMatrix* read_sparse_matrix_from_rb(const char *filename);
+
+#endif
diff --git a/src/sparse_matrix.c b/src/sparse_matrix.c
new file mode 100644
index 0000000..1e705b1
--- /dev/null
+++ b/src/sparse_matrix.c
@@ -0,0 +1,11 @@
+#include "sparse_matrix.h"
+#include <stdlib.h>
+
+void free_sparse_matrix(SparseMatrix *matrix) {
+    if (matrix) {
+        free(matrix->arcs);
+        free(matrix);
+    }
+}
+
+
diff --git a/src/sparse_matrix.h b/src/sparse_matrix.h
new file mode 100644
index 0000000..c5516bc
--- /dev/null
+++ b/src/sparse_matrix.h
@@ -0,0 +1,23 @@
+#ifndef SPARSE_MATRIX_H
+#define SPARSE_MATRIX_H
+
+/* One non-zero entry; origin and dest are 0-based node indices. */
+typedef struct {
+    int origin;
+    int dest;
+    double value;
+} Arc;
+
+/* Sparse matrix over num_nodes nodes, stored as an array of num_arcs arcs. */
+typedef struct {
+    int num_nodes;
+    int num_arcs;
+    Arc *arcs;
+} SparseMatrix;
+
+void free_sparse_matrix(SparseMatrix *matrix);
+void print_sparse_matrix(const SparseMatrix *matrix);
+SparseMatrix* read_sparse_matrix_from_mtx(const char *filename);
+void multiply_vector_matrix(const double *vector, const SparseMatrix *matrix, double *result);
+
+#endif
diff --git a/src/time_helper.c b/src/time_helper.c
new file mode 100644
index 0000000..0753e2c
--- /dev/null
+++ b/src/time_helper.c
@@ -0,0 +1,15 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include "time_helper.h"
+
+/* Print "<label>: <seconds>.<microseconds> seconds" for end - start. */
+void print_time_diff(const char* label, struct timeval* start, struct timeval* end) {
+    long seconds = end->tv_sec - start->tv_sec;
+    long microseconds = end->tv_usec - start->tv_usec;
+    if (microseconds < 0) {
+        seconds -= 1;
+        microseconds += 1000000;
+    }
+    // Zero-pad to six digits: 1 s + 5000 us must print as 1.005000, not 1.5000.
+    printf("%s: %ld.%06ld seconds\n", label, seconds, microseconds);
+}
diff --git a/src/time_helper.h b/src/time_helper.h
new file mode 100644
index 0000000..c0e4eb5
--- /dev/null
+++ b/src/time_helper.h
@@ -0,0 +1,8 @@
+#ifndef TIME_HELPER_H
+#define TIME_HELPER_H
+
+#include <sys/time.h>
+
+void print_time_diff(const char* label, struct timeval* start, struct timeval* end);
+
+#endif
diff --git a/src/vector.c b/src/vector.c
new file mode 100644
index 0000000..cc46439
--- /dev/null
+++ b/src/vector.c
@@ -0,0 +1,58 @@
+#include <math.h>
+#include "vector.h"
+#include "sparse_matrix.h"
+
+/* Set the first `size` entries of `vector` to `value`. */
+void init_vector(double *vector, int size, double value) {
+    for (int i = 0; i < size; ++i) {
+        vector[i] = value;
+    }
+}
+
+/* Return the sum of |vector1[i] - vector2[i]| over the first `size` entries. */
+double diff_norm_vector(double *vector1, double *vector2, int size) {
+    double res = 0.0;
+    for (int i = 0; i < size; ++i) {
+        res += fabs(vector1[i] - vector2[i]);
+    }
+    return res;
+}
+
+/*
+ * Fill `res` (num_nodes entries) with 1 for every node that is the `dest`
+ * of at least one arc and 0 for all other nodes.
+ */
+void generate_f(const SparseMatrix *matrix, double *res) {
+    int N = matrix->num_nodes;
+    init_vector(res, N, 0);
+    int num_arcs = matrix->num_arcs;
+    for (int i = 0; i < num_arcs; ++i) {
+        res[matrix->arcs[i].dest] = 1;
+    }
+}
+
+/* Return the dot product of the first `N` entries of `v1` and `v2`. */
+double vec_product(const double* v1, const double* v2, int N) {
+    double sum = 0.0;
+    for (int i = 0; i < N; i++) {
+        sum += v1[i] * v2[i];
+    }
+    return sum;
+}
+
+/*
+ * Divide every entry of `v` by the sum of all entries so that they add up
+ * to 1. A vector whose entries sum to zero is left unchanged.
+ */
+void normalize_vector(double* v, int N) {
+    double sum = 0.0;
+    for (int i = 0; i < N; i++) {
+        sum += v[i];
+    }
+    if (sum == 0.0) {
+        return;
+    }
+    for (int i = 0; i < N; i++) {
+        v[i] /= sum;
+    }
+}
diff --git a/src/vector.h b/src/vector.h
new file mode 100644
index 0000000..c771211
--- /dev/null
+++ b/src/vector.h
@@ -0,0 +1,11 @@
+#ifndef VECTOR_H
+#define VECTOR_H
+#include "sparse_matrix.h"
+
+void init_vector(double *vector, int size, double value);
+double diff_norm_vector(double *vector1, double *vector2, int size);
+void generate_f(const SparseMatrix *matrix, double *res);
+void normalize_vector(double* v, int N);
+double vec_product(const double* v1, const double* v2, int N);
+
+#endif