8.6 פרויקט סיכום פרויקט

פרויקט סיכום - ניתוח ביצועים - performance analysis¶

הקדמה¶

בפרויקט הזה נחבר את כל מה שלמדנו בפרק 8. נכתוב תוכניות שמדגימות תופעות ביצועים שנובעות מארכיטקטורת המעבד, נמדוד אותן, ונסביר את התוצאות באמצעות הידע שצברנו:

זכרון מטמון (פרק 8.3) - cache lines, spatial locality
חיזוי הסתעפויות (פרק 8.2) - sorted vs unsorted, branch prediction
שיתוף כוזב (פרק 8.3) - false sharing בין threads
מוני חומרה (perf) - מדידה ישירה של cache misses, branch mispredictions, IPC

הפרויקט בנוי ב-4 שלבים. בכל שלב תכתבו קוד, תריצו, תמדדו, ותכתבו הסבר של התוצאות.

שלב 1 - זכרון מטמון ו-stride¶

מה לעשות¶

כתבו תוכנית בC שמקצה מערך גדול (לפחות 64MB) וסורקת אותו עם אורכי צעד (stride) שונים: 1, 16, 64, 128, 256, 512, 1024, 4096 בתים. מדדו את זמן הגישה הממוצע עבור כל stride.

קוד התחלתי¶

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define ARRAY_SIZE (64 * 1024 * 1024)  // 64 MB
#define ITERATIONS 10

double measure_stride(char *arr, long size, int stride) {
    struct timespec start, end;
    long count = 0;
    volatile char sink;  // volatile כדי שהקומפיילר לא יבטל את הקריאות

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int iter = 0; iter < ITERATIONS; iter++) {
        for (long i = 0; i < size; i += stride) {
            sink = arr[i];
            count++;
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    double elapsed = (end.tv_sec - start.tv_sec)
                   + (end.tv_nsec - start.tv_nsec) / 1e9;
    return elapsed / count * 1e9;  // ננושניות לגישה
}

int main() {
    char *arr = malloc(ARRAY_SIZE);
    if (!arr) { perror("malloc"); return 1; }
    memset(arr, 1, ARRAY_SIZE);  // חימום - ודאו שכל הדפים ממופים

    int strides[] = {1, 16, 64, 128, 256, 512, 1024, 4096};
    int num_strides = sizeof(strides) / sizeof(strides[0]);

    printf("%-10s %-15s\n", "Stride", "ns/access");
    printf("%-10s %-15s\n", "------", "---------");

    for (int s = 0; s < num_strides; s++) {
        double ns = measure_stride(arr, ARRAY_SIZE, strides[s]);
        printf("%-10d %-15.2f\n", strides[s], ns);
    }

    free(arr);
    return 0;
}

מה צריך לספק¶

קמפלו עם gcc -O2 -o cache_stride cache_stride.c והריצו.
תעדו את התוצאות בטבלה.
הסבירו: למה הביצועים משתנים עם הstride? ספציפית:
למה stride=1 ו-stride=16 דומים?
למה יש קפיצה ב-stride=64? (רמז: גודל cache line)
למה stride=4096 האיטי ביותר? (רמז: TLB)

תוצאות צפויות (בקירוב)¶

Stride	ns/access	הסבר
1	0.3-0.5	כל 64 גישות miss אחד. 63/64 hits.
16	0.5-1.0	כל 4 גישות miss אחד. 3/4 hits.
64	2-5	כל גישה miss! (stride = cache line)
128	2-5	כל גישה miss (דומה ל-64)
256	2-5	כל גישה miss
512	3-6	כל גישה miss + מתחילים לחרוג מL2
1024	4-8	miss + TLB pressure מתחיל
4096	5-15	כל גישה cache miss + TLB miss!

שלב 2 - חיזוי הסתעפויות¶

מה לעשות¶

כתבו את הbenchmark המפורסם של sorted-vs-unsorted. הריצו עם מערך ממוין ועם מערך מעורבב, ומדדו את ההבדל.

קוד התחלתי¶

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 100000
#define REPS 500

int compare_ints(const void *a, const void *b) {
    return (*(int *)a - *(int *)b);
}

void shuffle(int *arr, int n) {
    for (int i = n - 1; i > 0; i--) {
        int j = rand() % (i + 1);
        int tmp = arr[i];
        arr[i] = arr[j];
        arr[j] = tmp;
    }
}

int main() {
    int *arr = malloc(N * sizeof(int));
    srand(42);
    for (int i = 0; i < N; i++)
        arr[i] = rand() % 256;

    struct timespec start, end;

    // --- ניסוי 1: מערך ממוין ---
    qsort(arr, N, sizeof(int), compare_ints);

    clock_gettime(CLOCK_MONOTONIC, &start);
    long long sum_sorted = 0;
    for (int r = 0; r < REPS; r++) {
        for (int i = 0; i < N; i++) {
            if (arr[i] >= 128)
                sum_sorted += arr[i];
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double t_sorted = (end.tv_sec - start.tv_sec)
                    + (end.tv_nsec - start.tv_nsec) / 1e9;

    // --- ניסוי 2: מערך מעורבב ---
    shuffle(arr, N);

    clock_gettime(CLOCK_MONOTONIC, &start);
    long long sum_unsorted = 0;
    for (int r = 0; r < REPS; r++) {
        for (int i = 0; i < N; i++) {
            if (arr[i] >= 128)
                sum_unsorted += arr[i];
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double t_unsorted = (end.tv_sec - start.tv_sec)
                      + (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("sorted:   %.3f sec (sum=%lld)\n", t_sorted, sum_sorted);
    printf("unsorted: %.3f sec (sum=%lld)\n", t_unsorted, sum_unsorted);
    printf("ratio:    %.2fx\n", t_unsorted / t_sorted);

    // --- ניסוי 3 (בונוס): גרסה branchless ---
    clock_gettime(CLOCK_MONOTONIC, &start);
    long long sum_branchless = 0;
    for (int r = 0; r < REPS; r++) {
        for (int i = 0; i < N; i++) {
            int mask = -(arr[i] >= 128);
            sum_branchless += arr[i] & mask;
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double t_branchless = (end.tv_sec - start.tv_sec)
                        + (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("branchless: %.3f sec (sum=%lld)\n",
           t_branchless, sum_branchless);

    free(arr);
    return 0;
}

מה צריך לספק¶

הריצו והציגו את התוצאות.
הסבירו: למה הממוין מהיר? מה קורה בחזאי ההסתעפויות?
הסבירו: למה הbranchless version עובד טוב גם על מערך מעורבב?
האם הברנצ'לס תמיד הכי מהיר? מתי לא?

מה לעשות¶

צרו שני threads שכל אחד מגדיל משתנה משלו. בניסוי ראשון, המשתנים צמודים (false sharing). בניסוי שני, מופרדים על ידי padding.

קוד התחלתי¶

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>

#define COUNT 200000000L  // 200 מיליון

// --- גרסה 1: בלי padding (false sharing) ---
struct shared_counters {
    volatile long c0;
    volatile long c1;
};

// --- גרסה 2: עם padding ---
struct padded_counters {
    volatile long c0;
    char pad[64 - sizeof(long)];  // ריפוד לcache line נפרד
    volatile long c1;
};

void *worker(void *arg) {
    volatile long *counter = (volatile long *)arg;
    for (long i = 0; i < COUNT; i++) {
        (*counter)++;
    }
    return NULL;
}

double run_test(void *counter0, void *counter1) {
    pthread_t t0, t1;
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
    pthread_create(&t0, NULL, worker, counter0);
    pthread_create(&t1, NULL, worker, counter1);
    pthread_join(t0, NULL);
    pthread_join(t1, NULL);
    clock_gettime(CLOCK_MONOTONIC, &end);

    return (end.tv_sec - start.tv_sec)
         + (end.tv_nsec - start.tv_nsec) / 1e9;
}

int main() {
    // ניסוי 1: false sharing
    struct shared_counters shared = {0, 0};
    double t_shared = run_test(&shared.c0, &shared.c1);

    // ניסוי 2: padded
    struct padded_counters padded = {0, .c1 = 0};
    double t_padded = run_test(&padded.c0, &padded.c1);

    // ניסוי 3 (בונוס): thread בודד כ-baseline
    struct timespec start, end;
    volatile long single = 0;
    clock_gettime(CLOCK_MONOTONIC, &start);
    for (long i = 0; i < COUNT; i++) single++;
    clock_gettime(CLOCK_MONOTONIC, &end);
    double t_single = (end.tv_sec - start.tv_sec)
                    + (end.tv_nsec - start.tv_nsec) / 1e9;

    printf("single thread:          %.3f sec\n", t_single);
    printf("two threads (shared):   %.3f sec\n", t_shared);
    printf("two threads (padded):   %.3f sec\n", t_padded);
    printf("shared/padded ratio:    %.2fx\n", t_shared / t_padded);
    printf("shared/single ratio:    %.2fx\n", t_shared / t_single);

    return 0;
}

מה צריך לספק¶

קמפלו עם gcc -O2 -pthread -o false_sharing false_sharing.c והריצו.
תעדו: מה הזמן של single, shared, padded?
הסבירו: למה shared איטי יותר מsingle? (שני threads היו אמורים להיות מהירים פי 2, לא?)
הסבירו: מה הפרוטוקול MESI עושה בכל כתיבה במקרה shared?
הסבירו: למה padded פותר את הבעיה?

תוצאות צפויות (בקירוב)¶

single thread:          0.400 sec
two threads (shared):   2.500 sec  (איטי פי 6 מsingle!)
two threads (padded):   0.400 sec  (כמו single - שני threads עובדים במקביל)
shared/padded ratio:    6.25x
shared/single ratio:    6.25x

שלב 4 - מדידה עם perf¶

מה לעשות¶

השתמשו ב-perf stat כדי למדוד מוני חומרה (hardware counters) עבור כל אחד מהשלבים הקודמים. הריצו כל תוכנית עם perf וניתחו את התוצאות.

פקודות לדוגמה¶

# שלב 1 - cache misses
perf stat -e cache-misses,cache-references,\
L1-dcache-load-misses,L1-dcache-loads \
./cache_stride

# שלב 2 - branch mispredictions
perf stat -e branches,branch-misses,\
instructions,cycles \
./branch_test

# שלב 3 - false sharing
perf stat -e cache-misses,cache-references,\
L1-dcache-load-misses,instructions,cycles \
./false_sharing

# IPC (Instructions Per Cycle) לכל התוכניות
perf stat -e instructions,cycles ./program

מה צריך לספק¶

לכל תוכנית, תעדו את הפלט של perf וענו על השאלות:

שלב 1 (cache stride):
כמה L1-dcache-load-misses עם stride=1 לעומת stride=64?
מה היחס בין cache-references ל-cache-misses?
שלב 2 (branch prediction):
כמה branch-misses (באחוזים) עם מערך ממוין לעומת מעורבב?
מהו הIPC (instructions/cycles) בכל מקרה? מי גבוה יותר ולמה?
שלב 3 (false sharing):
כמה cache-misses עם shared לעומת padded?
מהו הIPC? למה הוא נמוך עם false sharing?
שאלה מסכמת: מבין שלושת הגורמים (cache misses, branch mispredictions, false sharing), איזה השפיע הכי הרבה על הביצועים בניסויים שלכם? מה היחס הגדול ביותר שמדדתם?

הנחיות להגשה¶

קוד מקור של כל 3 התוכניות (מותר להתבסס על הקוד ההתחלתי)
פלט של כל הרצה (זמנים + פלט perf)
מסמך הסבר (עמוד-עמודיים) שכולל:
תוצאות בטבלאות מסודרות
הסבר של כל תוצאה באמצעות הידע מפרק 8
תשובות לכל השאלות שלמעלה
מסקנות: מה למדתם על הקשר בין ארכיטקטורת מעבד לביצועי תוכנה

8.6 פרויקט סיכום פרויקט

פרויקט סיכום - ניתוח ביצועים - performance analysis¶

הקדמה¶

שלב 1 - זכרון מטמון ו-stride¶

מה לעשות¶

קוד התחלתי¶

מה צריך לספק¶

תוצאות צפויות (בקירוב)¶

שלב 2 - חיזוי הסתעפויות¶

מה לעשות¶

קוד התחלתי¶

מה צריך לספק¶

שלב 3 - שיתוף כוזב - false sharing¶

מה לעשות¶

קוד התחלתי¶

מה צריך לספק¶

תוצאות צפויות (בקירוב)¶

שלב 4 - מדידה עם perf¶

מה לעשות¶

פקודות לדוגמה¶

מה צריך לספק¶

הנחיות להגשה¶