10.1 אסמבלי בתוך C פתרון

פתרון - אסמבלי בתוך C - inline assembly¶

פתרון 1 - קריאת TSC ומדידת ביצועים¶

#include <stdio.h>
#include <stdint.h>

/*
 * Read the CPU time-stamp counter using the x86 RDTSC instruction.
 *
 * RDTSC leaves the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX; the two halves are combined into a single 64-bit value.
 *
 * Returns: the current 64-bit counter value.
 *
 * Note: RDTSC is not a serializing instruction, so the CPU may execute it
 * out of order relative to neighbouring instructions. Measurements of very
 * short code sequences are therefore only approximate.
 */
static inline uint64_t read_tsc(void)
{
    uint32_t lo, hi;

    /* __asm__ instead of asm: the plain keyword is a GNU extension that is
     * not recognised under strict ISO modes such as -std=c11. */
    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));

    return ((uint64_t)hi << 32) | lo;
}

/*
 * Measure, in time-stamp-counter ticks, the cost of a single addition,
 * a printf call, and a 1000-iteration accumulation loop.
 *
 * The operands are volatile so the compiler cannot fold or drop the work
 * being timed.
 */
int main(void)
{
    volatile int a = 100, b = 200;
    volatile int result;
    uint64_t start, end;

    // measure a single addition
    start = read_tsc();
    result = a + b;
    end = read_tsc();
    // uint64_t is not `unsigned long` on every ABI, so cast and use %llu
    printf("addition: %llu cycles\n", (unsigned long long)(end - start));

    // measure printf
    start = read_tsc();
    printf("test\n");
    end = read_tsc();
    printf("printf: %llu cycles\n", (unsigned long long)(end - start));

    // measure a loop
    start = read_tsc();
    volatile int sum = 0;
    for (int i = 0; i < 1000; i++)
        sum += i;
    end = read_tsc();
    printf("loop 1000: %llu cycles\n", (unsigned long long)(end - start));

    return 0;
}

שימו לב שהמשתנים מוגדרים כ-volatile כדי שהקומפיילר לא יבצע אופטימיזציות ויסיר את החישובים.

פתרון 2 - זיהוי יצרן המעבד עם CPUID¶

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/*
 * Store the CPU vendor identification string (CPUID leaf 0) in `vendor`.
 *
 * vendor: destination buffer of at least 13 bytes; receives 12 characters
 *         taken from EBX, EDX, ECX (in that order) followed by a NUL.
 */
void get_cpu_vendor(char *vendor)
{
    uint32_t eax, ebx, ecx, edx;

    /* CPUID overwrites EAX as well (leaf 0 returns the highest basic leaf
     * there), so EAX must be declared as an output; otherwise the compiler
     * may assume it still holds the input value 0.
     * __asm__ is used because plain asm is unavailable under -std=c11. */
    __asm__("cpuid"
        : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
        : "a"(0)
    );
    (void)eax; /* only the vendor registers are needed */

    /* memcpy avoids dereferencing a char buffer through uint32_t pointers,
     * which could be misaligned and breaks the strict-aliasing rule. */
    memcpy(vendor,     &ebx, sizeof ebx);
    memcpy(vendor + 4, &edx, sizeof edx);
    memcpy(vendor + 8, &ecx, sizeof ecx);
    vendor[12] = '\0';
}

/*
 * Store the processor brand string in `brand`.
 *
 * The brand string is returned by CPUID leaves 0x80000002..0x80000004,
 * 16 bytes per leaf (EAX, EBX, ECX, EDX in that order), 48 bytes in total.
 *
 * brand: destination buffer of at least 49 bytes. Receives the 48 bytes
 *        followed by a NUL, or an empty string when the CPU does not
 *        report support for the brand-string leaves.
 */
void get_cpu_brand(char *brand)
{
    uint32_t regs[4];

    /* Leaf 0x80000000 returns the highest supported extended leaf in EAX.
     * Querying an unsupported leaf yields unrelated data, so bail out. */
    __asm__("cpuid"
        : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
        : "a"(0x80000000u)
    );
    if (regs[0] < 0x80000004u) {
        brand[0] = '\0';
        return;
    }

    for (unsigned i = 0; i < 3; i++) {
        __asm__("cpuid"
            : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
            : "a"(0x80000002u + i)
        );
        /* regs[] is already in EAX, EBX, ECX, EDX order: copy all 16 bytes */
        memcpy(brand + i * 16, regs, sizeof regs);
    }
    brand[48] = '\0';
}

/* Query the CPU vendor and brand strings via CPUID and print both. */
int main(void)
{
    char brand_text[49];   /* 48 brand bytes + NUL */
    char vendor_text[13];  /* 12 vendor bytes + NUL */

    get_cpu_vendor(vendor_text);
    get_cpu_brand(brand_text);

    printf("CPU vendor: %s\n", vendor_text);
    printf("CPU brand:  %s\n", brand_text);

    return 0;
}

הפונקציה get_cpu_brand קוראת ל-cpuid שלוש פעמים. בכל קריאה, ארבעת הרגיסטרים eax, ebx, ecx, edx מחזירים 16 בתים של מחרוזת שם המעבד - סך הכל 48 תווים. שלושת הערכים 0x80000002, 0x80000003, 0x80000004 הם ה-function IDs (ה-leaves של CPUID) שמחזירים את שלושת חלקי השם.

פתרון 3 - חיבור וחיסור עם inline asm¶

#include <stdio.h>

/*
 * Add two ints with a single x86 `addl` instruction.
 *
 * Returns a + b. The addition is done by the instruction itself, so the
 * result wraps around in two's complement on overflow.
 */
int asm_add(int a, int b)
{
    int result;

    /* The "0" constraint places `a` in the same register as operand 0, so
     * `addl %2, %0` computes result = a + b in place. "cc" tells the
     * compiler the flags are modified.
     * __asm__ is used because plain asm is unavailable under -std=c11. */
    __asm__("addl %2, %0"
        : "=r"(result)
        : "0"(a), "r"(b)
        : "cc"
    );
    return result;
}

/*
 * Subtract two ints with a single x86 `subl` instruction.
 *
 * Returns a - b. The subtraction is done by the instruction itself, so the
 * result wraps around in two's complement on overflow.
 */
int asm_sub(int a, int b)
{
    int result;

    /* The "0" constraint places `a` in the same register as operand 0, so
     * `subl %2, %0` computes result = a - b in place. "cc" tells the
     * compiler the flags are modified.
     * __asm__ is used because plain asm is unavailable under -std=c11. */
    __asm__("subl %2, %0"
        : "=r"(result)
        : "0"(a), "r"(b)
        : "cc"
    );
    return result;
}

/*
 * Multiply two ints with the two-operand form of x86 `imull`.
 *
 * Returns the low 32 bits of a * b (signed multiplication).
 */
int asm_mul(int a, int b)
{
    int result;

    /* The "0" constraint places `a` in the same register as operand 0, so
     * `imull %2, %0` computes result = a * b in place. "cc" tells the
     * compiler the flags are modified.
     * __asm__ is used because plain asm is unavailable under -std=c11. */
    __asm__("imull %2, %0"
        : "=r"(result)
        : "0"(a), "r"(b)
        : "cc"
    );
    return result;
}

/* Exercise asm_add, asm_sub and asm_mul on a few sample inputs. */
int main(void)
{
    /* Compute every result first; the expected value is noted per line. */
    const int add_pos   = asm_add(3, 4);     // 7
    const int add_neg   = asm_add(-5, 3);    // -2
    const int sub_pos   = asm_sub(10, 3);    // 7
    const int sub_neg   = asm_sub(3, 10);    // -7
    const int mul_pos   = asm_mul(6, 7);     // 42
    const int mul_neg   = asm_mul(-3, 4);    // -12
    const int mul_zero  = asm_mul(0, 100);   // 0

    printf("add(3, 4)   = %d\n", add_pos);
    printf("add(-5, 3)  = %d\n", add_neg);
    printf("sub(10, 3)  = %d\n", sub_pos);
    printf("sub(3, 10)  = %d\n", sub_neg);
    printf("mul(6, 7)   = %d\n", mul_pos);
    printf("mul(-3, 4)  = %d\n", mul_neg);
    printf("mul(0, 100) = %d\n", mul_zero);

    return 0;
}

שימו לב לconstraint "0" בקלט הראשון. הוא אומר לקומפיילר: "שים את a באותו רגיסטר כמו אופרנד 0 (result)". זה נחוץ כי addl %2, %0 כותב את התוצאה לאופרנד 0 - אז אופרנד 0 צריך להכיל את הערך הראשוני של a.

פתרון 4 - קריאת מערכת ישירה - write¶

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Invoke the Linux x86-64 write system call (number 1) directly with the
 * `syscall` instruction, bypassing the C library wrapper.
 *
 * fd:  file descriptor to write to
 * buf: bytes to write
 * len: number of bytes to write
 *
 * Returns the raw kernel result: the number of bytes written on success,
 * or a negative errno value on failure. Unlike write(2), errno is not set.
 */
static inline long my_write(int fd, const char *buf, long len)
{
    long ret;

    /* __asm__ is used because plain asm is unavailable under -std=c11.
     * `syscall` overwrites RCX and R11, hence the clobbers; "memory" stops
     * the compiler from reordering accesses to buf around the call. */
    __asm__ volatile(
        "syscall"
        : "=a"(ret)
        : "a"((long)1),     // rax = syscall number: write = 1
          "D"((long)fd),    // rdi = fd
          "S"(buf),         // rsi = buf
          "d"(len)          // rdx = len
        : "rcx", "r11", "memory"
    );
    return ret;
}

/*
 * Demonstrate my_write: print a message to stdout, then create
 * test_output.txt and write a message into it.
 *
 * Returns 0 on success, 1 if opening, writing or closing the file fails.
 */
int main(void)
{
    // print to stdout
    const char msg[] = "hello from my_write!\n";
    my_write(1, msg, strlen(msg));

    // write to a file
    int fd = open("test_output.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    const char file_msg[] = "this was written with inline asm syscall\n";
    long bytes = my_write(fd, file_msg, strlen(file_msg));
    if (bytes < 0) {
        // the raw syscall returns -errno instead of setting errno
        fprintf(stderr, "my_write: %s\n", strerror((int)-bytes));
        close(fd);
        return 1;
    }
    printf("wrote %ld bytes to file\n", bytes);

    if (close(fd) < 0) {
        perror("close");
        return 1;
    }
    return 0;
}

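שימו לב שה-casts ל-(long) חשובים כדי להבטיח שהערכים יהיו 64-ביט (ב-x86_64, הרגיסטרים הם 64-ביט). בלי ה-casts, הקומפיילר מתייחס לאופרנד כערך של 32 ביט בלבד, ואין הבטחה ש-32 הביטים העליונים של הרגיסטר יכילו את הערך הצפוי. הרגיסטרים rcx ו-r11 מופיעים ברשימת ה-clobbers כי הפקודה syscall דורסת אותם.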

פתרון 5 - compare-and-swap¶

#include <stdio.h>
#include <stdint.h>

/*
 * Atomic compare-and-swap built on x86 `lock cmpxchgl`.
 *
 * If *ptr equals `expected`, store `desired` into *ptr.
 *
 * ptr:      location to update
 * expected: value *ptr must currently hold for the swap to happen
 * desired:  value to store on success
 *
 * Returns 1 if the swap happened, 0 otherwise.
 */
static inline int cas(volatile int *ptr, int expected, int desired)
{
    unsigned char swapped;

    /* cmpxchgl compares EAX with *ptr and sets ZF on a match; sete turns ZF
     * into 0/1. The "q" constraint lets the compiler choose any
     * byte-addressable register instead of hard-coding CL.
     * __asm__ is used because plain asm is unavailable under -std=c11. */
    __asm__ volatile(
        "lock cmpxchgl %3, %1\n\t"
        "sete %0"
        : "=q"(swapped), "+m"(*ptr), "+a"(expected)
        : "r"(desired)
        : "cc", "memory"
    );
    return swapped;
}

/*
 * Add 1 to *counter using a compare-and-swap retry loop.
 *
 * Each attempt reads the current value and asks cas() to replace it with
 * that value plus one; the attempt is repeated until cas() reports success.
 */
void atomic_increment(volatile int *counter)
{
    for (;;) {
        int seen = *counter;
        int next = seen + 1;

        if (cas(counter, seen, next))
            return;
    }
}

/*
 * Demonstrate cas(): one swap that should succeed, one that should fail,
 * then 100 increments of a counter through atomic_increment().
 */
int main(void)
{
    int val = 5;
    int swapped;

    printf("before: val = %d\n", val);

    // CAS with the matching expected value - should succeed
    swapped = cas(&val, 5, 10);
    if (!swapped)
        printf("CAS(5->10) failed, val = %d\n", val);
    else
        printf("CAS(5->10) succeeded, val = %d\n", val);

    // CAS with a stale expected value - should fail
    swapped = cas(&val, 5, 20);
    if (!swapped)
        printf("CAS(5->20) failed, val = %d\n", val);
    else
        printf("CAS(5->20) succeeded, val = %d\n", val);

    // bonus: atomic counter
    volatile int counter = 0;
    int remaining = 100;
    while (remaining-- > 0)
        atomic_increment(&counter);
    printf("counter after 100 increments: %d\n", counter); // 100

    return 0;
}

הפונקציה atomic_increment עובדת בלולאת CAS: היא קוראת את הערך הנוכחי, מחשבת את הערך החדש (הנוכחי + 1), ומנסה להחליף. אם thread אחר שינה את הערך בינתיים, הCAS ייכשל והלולאה תנסה שוב עם הערך המעודכן.

הprefix lock חיוני כאן - בלעדיו הפקודה לא תהיה אטומית ושני threads יכולים לקרוא את אותו הערך ולדרוס אחד את השני.