It is not; there is no real reason why it would be particularly slow. It is just a matter of switching virtual address spaces and registers, which every OS has to do anyway.
A quick-and-dirty benchmark:
#include <fcntl.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/mman.h>
/*
 * Semaphores used by the two workers. get_sems() points them at adjacent
 * sem_t slots inside a MAP_SHARED mapping; main() initialises both as
 * process-shared with an initial value of 0.
 */
sem_t *sem1, *sem2;
/*
 * Measuring side of the benchmark. Never returns.
 *
 * Each iteration waits on sem1 and then posts sem2. Whenever the value
 * returned by time(NULL) changes (i.e. once per second), the number of
 * iterations counted since the previous report is printed on its own line
 * and the counter is reset.
 */
void worker1(void) {
    int iterations = 0;
    time_t prev_sec = time(NULL);

    for (;;) {
        const time_t now = time(NULL);

        /* A new second has started: report and restart the count. */
        if (now != prev_sec) {
            printf("%d\n", iterations);
            iterations = 0;
            prev_sec = now;
        }

        ++iterations;

        /* Block until the peer hands over, then hand back. */
        sem_wait(sem1);
        sem_post(sem2);
    }
}
/*
 * Peer side of the benchmark. Never returns.
 *
 * Mirrors worker1: each iteration posts sem1 (releasing worker1) and then
 * blocks on sem2 until worker1 posts it back.
 */
void worker2(void) {
    for (;;) {
        sem_post(sem1);
        sem_wait(sem2);
    }
}
/* Descriptor of the backing file; main() opens it before calling get_sems(). */
int fd;

/*
 * Map the first 4096 bytes of the file open on `fd` as shared, read/write
 * memory and point sem1/sem2 at two adjacent sem_t slots at the start of
 * that mapping.
 *
 * If the mapping cannot be created, a diagnostic is printed and the process
 * exits, instead of leaving sem1/sem2 derived from MAP_FAILED.
 */
void get_sems(void) {
    void *ptr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    if (ptr == MAP_FAILED) {
        perror("mmap");
        exit(EXIT_FAILURE);
    }

    sem1 = ptr;
    sem2 = sem1 + 1;
}
/*
 * Benchmark entry point.
 *
 * Creates /tmp/foo, sizes it to 4096 bytes, maps it via get_sems(), and
 * initialises the two semaphores as process-shared with value 0. It then
 * forks: the parent runs worker1() (which prints the per-second counts) and
 * the child runs worker2(). Neither worker returns.
 *
 * Returns EXIT_FAILURE with a diagnostic if any setup step fails.
 */
int main(void) {
    fd = open("/tmp/foo", O_CREAT | O_TRUNC | O_RDWR, 0666);
    if (fd < 0) {
        perror("open");
        return EXIT_FAILURE;
    }

    /* Grow the freshly truncated file so the 4096-byte mapping is backed. */
    if (ftruncate(fd, 4096) < 0) {
        perror("ftruncate");
        return EXIT_FAILURE;
    }

    get_sems();

    /* pshared = 1 (shared between processes), initial value 0. */
    if (sem_init(sem1, 1, 0) < 0 || sem_init(sem2, 1, 0) < 0) {
        perror("sem_init");
        return EXIT_FAILURE;
    }

    pid_t pid = fork();
    if (pid < 0) {
        /* Without a child, worker1() would block forever on sem1. */
        perror("fork");
        return EXIT_FAILURE;
    }

    if (pid > 0) {
        worker1();
    } else {
        /*
         * NOTE(review): the MAP_SHARED mapping should already be inherited
         * across fork(), so this second mapping looks redundant; it is kept
         * to preserve the original behaviour.
         */
        get_sems();
        worker2();
    }

    return 0;
}
Run on my current Linux system (a Core i5-10210U), it gets about 300k switches per second. Running it on Hurd-in-KVM (which would supposedly be slower) gets about 400k switches per second.