/usr/src/kernel-patches/lustre/patches/raid5-serialize-ovelapping-reqs.patch is in linux-patch-lustre 1.8.5+dfsg-3ubuntu1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
RAID5 wasn't designed to support overlapping requests because
in Linux all I/Os are serialized by page/buffer lock. As Lustre
doesn't use pagecache on server, we need to serialize I/Os in RAID5.
Index: linux-2.6.9/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.9.orig/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400
+++ linux-2.6.9/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400
@@ -134,6 +134,7 @@ struct stripe_head {
unsigned long state; /* state flags */
atomic_t count; /* nr of active thread/requests */
spinlock_t lock;
+ wait_queue_head_t wait; /* wait queue for overlapping bios */
struct r5dev {
struct bio req;
struct bio_vec vec;
Index: linux-2.6.9/drivers/md/raid5.c
===================================================================
--- linux-2.6.9.orig/drivers/md/raid5.c 2006-05-22 00:11:21.000000000 +0400
+++ linux-2.6.9/drivers/md/raid5.c 2006-05-22 00:19:27.000000000 +0400
@@ -308,6 +308,7 @@ static int grow_stripes(raid5_conf_t *co
memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
sh->raid_conf = conf;
sh->lock = SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&sh->wait);
if (grow_buffers(sh, conf->raid_disks)) {
shrink_buffers(sh, conf->raid_disks);
@@ -878,6 +879,9 @@ static void compute_parity(struct stripe
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
} else
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+
+ /* wake up any thread waiting for this stripe's I/O to complete */
+ wake_up(&sh->wait);
}
/*
@@ -885,7 +889,7 @@ static void compute_parity(struct stripe
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
struct bio **bip;
raid5_conf_t *conf = sh->raid_conf;
@@ -894,13 +898,21 @@ static void add_stripe_bio (struct strip
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector);
-
spin_lock(&sh->lock);
spin_lock_irq(&conf->device_lock);
if (forwrite)
bip = &sh->dev[dd_idx].towrite;
else
bip = &sh->dev[dd_idx].toread;
+
+#if 1
+ if (*bip) {
+ /* overlapping bio, let's wait till first one is completed */
+ spin_unlock_irq(&conf->device_lock);
+ spin_unlock(&sh->lock);
+ return 1;
+ }
+#else
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
bip = & (*bip)->bi_next;
@@ -910,6 +922,7 @@ static void add_stripe_bio (struct strip
BUG();
if (*bip)
bi->bi_next = *bip;
+#endif
*bip = bi;
bi->bi_phys_segments ++;
spin_unlock_irq(&conf->device_lock);
@@ -932,6 +945,7 @@ static void add_stripe_bio (struct strip
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
+ return 0;
}
/*
@@ -1014,6 +1028,7 @@ static void handle_stripe(struct stripe_
rbi = dev->toread;
dev->toread = NULL;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&sh->wait);
while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
copy_data(0, rbi, dev->page, dev->sector);
rbi2 = r5_next_bio(rbi, dev->sector);
@@ -1059,6 +1074,7 @@ static void handle_stripe(struct stripe_
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
if (bi) to_write--;
+ wake_up(&sh->wait);
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
@@ -1511,6 +1527,16 @@ static inline void raid5_plug_device(rai
spin_unlock_irq(&conf->device_lock);
}
+static inline void raid5_wait_stripe(struct stripe_head *sh, int dd_idx, int forwrite)
+{
+ struct bio **bip;
+ if (forwrite)
+ bip = &sh->dev[dd_idx].towrite;
+ else
+ bip = &sh->dev[dd_idx].toread;
+ wait_event(sh->wait, *bip == NULL);
+}
+
static int make_request (request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
@@ -1580,6 +1606,7 @@ repeat:
* if we can't, then it's time to submit
* all collected bio's in order to free
* some space in the cache -bzzz */
+try_stripe:
sh = get_active_stripe(conf, new_sector, pd_idx, 1);
if (!sh && !(bi->bi_rw&RWA_MASK)) {
raid5_flush_bios(conf, bios, raid_disks);
@@ -1587,7 +1614,11 @@ repeat:
}
}
if (sh) {
- add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
+ if (add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+ release_stripe(sh);
+ raid5_wait_stripe(sh, dd_idx, bi->bi_rw&RW_MASK);
+ goto try_stripe;
+ }
} else {
/* cannot get stripe for read-ahead, just give-up */
clear_bit(BIO_UPTODATE, &bi->bi_flags);