This file is indexed.

/usr/src/kernel-patches/lustre/patches/raid5-serialize-ovelapping-reqs.patch is in linux-patch-lustre 1.8.5+dfsg-3ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

RAID5 was not designed to support overlapping requests, because in
Linux all I/Os are normally serialized by the page/buffer lock.  Since
Lustre does not use the pagecache on the server, I/Os have to be
serialized inside RAID5 itself.
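
The pattern the patch implements is easier to see in isolation. Below is a
stand-alone userspace sketch of the same idea in plain C with pthreads: a
condition variable stands in for the kernel wait queue, a single "pending"
pointer stands in for the towrite/toread slot, and an overlapping submitter
sleeps until the completion path clears the slot and wakes it, then retries.
The names (stripe_slot, submit, complete, writer) are illustrative only and
do not appear in the patch.

/* cc -pthread overlap.c && ./a.out */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Userspace analogue of one stripe/device slot: "pending" plays the
 * role of sh->dev[dd_idx].towrite, "wait" the role of sh->wait. */
struct stripe_slot {
	pthread_mutex_t lock;
	pthread_cond_t  wait;
	const char     *pending;	/* the one outstanding "bio", or NULL */
};

static struct stripe_slot slot = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, NULL
};

/* make_request() analogue: an overlapping submitter sleeps until the
 * slot drains, then retries -- the add_stripe_bio()/raid5_wait_stripe()/
 * goto try_stripe loop collapsed into one function. */
static void submit(const char *bio)
{
	pthread_mutex_lock(&slot.lock);
	while (slot.pending != NULL)		/* overlap detected */
		pthread_cond_wait(&slot.wait, &slot.lock);
	slot.pending = bio;			/* *bip = bi */
	pthread_mutex_unlock(&slot.lock);
	printf("%s submitted\n", bio);
}

/* handle_stripe()/compute_parity() analogue: drain the slot and wake
 * any overlapped waiter, as wake_up(&sh->wait) does in the patch. */
static void complete(void)
{
	pthread_mutex_lock(&slot.lock);
	printf("%s completed\n", slot.pending);
	slot.pending = NULL;
	pthread_cond_broadcast(&slot.wait);
	pthread_mutex_unlock(&slot.lock);
}

static void *writer(void *name)
{
	submit(name);
	usleep(1000);		/* pretend the I/O takes a while */
	complete();
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	pthread_create(&a, NULL, writer, "bio A");
	pthread_create(&b, NULL, writer, "bio B");	/* overlaps with A */
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

One detail of the real patch that the sketch glosses over: in make_request()
the submitter calls release_stripe() before sleeping in raid5_wait_stripe(),
and after waking it re-acquires the stripe from scratch via goto try_stripe
rather than reusing the old pointer. Dropping the reference first presumably
keeps the stripe from being pinned in the stripe cache while the submitter
blocks; stripe_head structures are recycled rather than freed, so waiting on
sh->wait stays safe, and the fresh get_active_stripe() lookup is what
guarantees the retry operates on the correct stripe.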

Index: linux-2.6.9/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.9.orig/include/linux/raid/raid5.h	2006-05-22 00:11:21.000000000 +0400
+++ linux-2.6.9/include/linux/raid/raid5.h	2006-05-22 00:11:21.000000000 +0400
@@ -134,6 +134,7 @@ struct stripe_head {
 	unsigned long		state;			/* state flags */
 	atomic_t		count;			/* nr of active thread/requests */
 	spinlock_t		lock;
+	wait_queue_head_t	wait;			/* waitchan for overlapped bio's */
 	struct r5dev {
 		struct bio	req;
 		struct bio_vec	vec;
Index: linux-2.6.9/drivers/md/raid5.c
===================================================================
--- linux-2.6.9.orig/drivers/md/raid5.c	2006-05-22 00:11:21.000000000 +0400
+++ linux-2.6.9/drivers/md/raid5.c	2006-05-22 00:19:27.000000000 +0400
@@ -308,6 +308,7 @@ static int grow_stripes(raid5_conf_t *co
 		memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
 		sh->raid_conf = conf;
 		sh->lock = SPIN_LOCK_UNLOCKED;
+		init_waitqueue_head(&sh->wait);
 
 		if (grow_buffers(sh, conf->raid_disks)) {
 			shrink_buffers(sh, conf->raid_disks);
@@ -878,6 +879,9 @@ static void compute_parity(struct stripe
 		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
 	} else
 		clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+
+	/* someone may be waiting for our completion */
+	wake_up(&sh->wait);
 }
 
 /*
@@ -885,7 +889,7 @@ static void compute_parity(struct stripe
  * toread/towrite point to the first in a chain. 
  * The bi_next chain must be in order.
  */
-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
 {
 	struct bio **bip;
 	raid5_conf_t *conf = sh->raid_conf;
@@ -894,13 +898,21 @@ static void add_stripe_bio (struct strip
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector);
 
-
 	spin_lock(&sh->lock);
 	spin_lock_irq(&conf->device_lock);
 	if (forwrite)
 		bip = &sh->dev[dd_idx].towrite;
 	else
 		bip = &sh->dev[dd_idx].toread;
+
+#if 1
+	if (*bip) {
+		/* overlapping bio: wait until the first one completes */
+		spin_unlock_irq(&conf->device_lock);
+		spin_unlock(&sh->lock);
+		return 1;
+	}
+#else
 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
 		BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
 		bip = & (*bip)->bi_next;
@@ -910,6 +922,7 @@ static void add_stripe_bio (struct strip
 		BUG();
 	if (*bip)
 		bi->bi_next = *bip;
+#endif
 	*bip = bi;
 	bi->bi_phys_segments ++;
 	spin_unlock_irq(&conf->device_lock);
@@ -932,6 +945,7 @@ static void add_stripe_bio (struct strip
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
+	return 0;
 }
 
 /*
@@ -1014,6 +1028,7 @@ static void handle_stripe(struct stripe_
 			rbi = dev->toread;
 			dev->toread = NULL;
 			spin_unlock_irq(&conf->device_lock);
+			wake_up(&sh->wait);
 			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 				copy_data(0, rbi, dev->page, dev->sector);
 				rbi2 = r5_next_bio(rbi, dev->sector);
@@ -1059,6 +1074,7 @@ static void handle_stripe(struct stripe_
 			bi = sh->dev[i].towrite;
 			sh->dev[i].towrite = NULL;
 			if (bi) to_write--;
+			wake_up(&sh->wait);
 
 			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
@@ -1511,6 +1527,16 @@ static inline void raid5_plug_device(rai
 	spin_unlock_irq(&conf->device_lock);
 }
 
+static inline void raid5_wait_stripe(struct stripe_head *sh, int dd_idx, int forwrite)
+{
+	struct bio **bip;
+	if (forwrite)
+		bip = &sh->dev[dd_idx].towrite;
+	else
+		bip = &sh->dev[dd_idx].toread;
+	wait_event(sh->wait, *bip == NULL);
+}
+
 static int make_request (request_queue_t *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1580,6 +1606,7 @@ repeat:
 				 * if we can't, then it's time to submit
 				 * all collected bio's in order to free
 				 * some space in the cache -bzzz */
+try_stripe:
 				sh = get_active_stripe(conf, new_sector, pd_idx, 1);
 				if (!sh && !(bi->bi_rw&RWA_MASK)) {
 					raid5_flush_bios(conf, bios, raid_disks);
@@ -1587,7 +1614,11 @@ repeat:
 				}
 			}
 			if (sh) {
-				add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
+				if (add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+					release_stripe(sh);
+					raid5_wait_stripe(sh, dd_idx, bi->bi_rw&RW_MASK);
+					goto try_stripe;
+				}
 			} else {
 				/* cannot get stripe for read-ahead, just give-up */
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);