Does calling pwrite from multiple threads DECREASE throughput on Linux?

> 奥さん
コメント有難うございます。そういう事なら確かにpwriteでもっと性能出て欲しいですね。単体のベンチマークソースコードとかって晒して貰う事は出来ますでしょうか?

http://d.hatena.ne.jp/nishidakeisuke/20080528/p1#c1211944915

おもしろいかもと思ったのでやってみた... あれ? 複数スレッドから書くとスループットが大幅に低下する? ほんとなんだろうか。

[kazuho@dev32 experiments]$ uname -a
Linux dev32.pathtraq.com 2.6.18-53.1.14.el5 #1 SMP Wed Mar 5 11:37:38 EST 2008 x86_64 x86_64 x86_64 GNU/Linux
[kazuho@dev32 experiments]$ time ./a.out hogehoge 1
start writing at 0 for 4194304 times

real    0m8.819s
user    0m0.470s
sys     0m8.348s
[kazuho@dev32 experiments]$ time ./a.out hogehoge 2
start writing at 32 for 2097152 times
start writing at 0 for 2097152 times

real    0m14.032s
user    0m0.595s
sys     0m19.210s
[kazuho@dev32 experiments]$ time ./a.out hogehoge 3
start writing at 0 for 1398101 times
start writing at 32 for 1398101 times
start writing at 64 for 1398101 times

real    0m11.873s
user    0m0.493s
sys     0m15.442s
[kazuho@dev32 experiments]$ time ./a.out hogehoge 4
start writing at 0 for 1048576 times
start writing at 64 for 1048576 times
start writing at 32 for 1048576 times
start writing at 96 for 1048576 times

real    0m13.759s
user    0m0.595s
sys     0m18.759s

CPU は Opteron 2218 x2 です。ちなみに手元の MacBook (Core 2 Duo @ 2GHz, 10.4.11) だと、スケールするっぽい。

[kazuho@KazMac:~/dev/experiments]$ time ./a.out hogehoge 1
start writing at 0 for 4194304 times

real    0m14.699s
user    0m3.191s
sys     0m11.453s
[kazuho@KazMac:~/dev/experiments]$ time ./a.out hogehoge 2
start writing at 0 for 2097152 times
start writing at 32 for 2097152 times

real    0m10.577s
user    0m3.189s
sys     0m16.751s

以下がソースコード

#define _XOPEN_SOURCE 500

#include <alloca.h>
#include <assert.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Total number of one-byte pwrite() calls; each thread performs
   LOOP / num_threads of them (integer division). */
#define LOOP (4 * 1024 * 1024)

/* Descriptor of the test file: opened in main(), written by every thread. */
static int fd;
/* Number of writer threads, taken from the second command-line argument. */
unsigned num_threads;

/*
 * Thread entry point: repeatedly overwrites one byte of the shared file
 * descriptor `fd` at a fixed offset.
 *
 * _off: the byte offset to write at.  It is an integer value carried in
 *       the void* thread argument, not a pointer to anything.
 *
 * Issues LOOP / num_threads one-byte pwrite() calls, all at the same
 * offset.  Aborts the process if any call does not write exactly one byte.
 * Always returns NULL.
 */
static void *write_loop(void *_off)
{
  /* Go through intptr_t: casting a pointer directly to int truncates on
     LP64 targets and triggers a pointer-to-int size warning. */
  long off = (long)(intptr_t)_off;
  unsigned count = LOOP / num_threads;
  unsigned i;
  ssize_t l;

  /* Format specifiers now match the argument types (long / unsigned);
     the printed text is unchanged. */
  fprintf(stderr, "start writing at %ld for %u times\n", off, count);
  for (i = 0; i < count; i++) {
    l = pwrite(fd, "\1", 1, (off_t)off);
    /* Explicit check instead of assert() so that a failed write is still
       detected when the program is built with NDEBUG. */
    if (l != 1) {
      if (l == -1) {
        perror("pwrite");
      } else {
        fprintf(stderr, "pwrite: short write\n");
      }
      abort();
    }
  }

  return NULL;
}

/*
 * Benchmark driver.
 *
 * Usage: prog test_file num_threads
 *
 * Creates/truncates test_file, sizes it to 1024 bytes, then starts
 * num_threads threads running write_loop().  Thread number k is handed the
 * byte offset k * 32.  Waits for all started threads before exiting.
 *
 * Returns 0 on success.  Exits with status 1 on bad arguments or when
 * memory allocation, opening/sizing the file, thread creation, or closing
 * the file fails.
 */
int main(int argc, char **argv)
{
  pthread_t *thr;
  unsigned long requested = 0;
  char *end = NULL;
  unsigned i, started;
  int err, status = 0;

  if (argc == 3) {
    requested = strtoul(argv[2], &end, 10);
  }
  /* Require a plain positive decimal number that fits in `unsigned`:
     rejects "", "-1" (which %u/strtoul would wrap to a huge value),
     trailing garbage, and 0 (nothing to benchmark). */
  if (argc != 3
      || argv[2][0] < '0' || argv[2][0] > '9'
      || *end != '\0'
      || requested == 0
      || requested != (unsigned)requested) {
    fprintf(stderr, "Usage: %s test_file num_threads\n", argv[0]);
    exit(1);
  }
  num_threads = (unsigned)requested;

  /* Heap instead of alloca(): the count comes from the command line, and
     an oversized stack allocation fails with no way to detect it. */
  thr = calloc(num_threads, sizeof *thr);
  if (thr == NULL) {
    perror("calloc");
    exit(1);
  }

  fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0666);
  if (fd == -1) {
    perror("open");
    free(thr);
    exit(1);
  }
  if (ftruncate(fd, 1024) != 0) {
    perror("ftruncate");
    close(fd);
    free(thr);
    exit(1);
  }

  for (started = 0; started < num_threads; started++) {
    /* The offset travels as an integer inside the void* argument; go
       through intptr_t so the conversion has the right width. */
    err = pthread_create(thr + started, NULL, write_loop,
                         (void *)((intptr_t)started * 32));
    if (err != 0) {
      fprintf(stderr, "pthread_create failed for thread %u (error %d)\n",
              started, err);
      status = 1;
      break;
    }
  }
  /* Join only the threads that were actually created. */
  for (i = 0; i < started; i++) {
    pthread_join(thr[i], NULL);
  }

  if (close(fd) != 0) {
    perror("close");
    status = 1;
  }
  free(thr);

  return status;
}

PS. デフォルトのカーネルだと (ry みたいな話だったりしたらどうしよう (汗