/usr/src/kernel-patches/lustre/patches/raid5-stripe-by-stripe-handling-rhel5.patch

diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
--- linux-2.6.18-53.orig/drivers/md/raid5.c	2007-12-28 14:55:08.000000000 +0800
+++ linux-2.6.18-53/drivers/md/raid5.c	2007-12-28 18:52:08.000000000 +0800
@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que
 	return ret;
 }
 
+static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi)
+{
+	sector_t first_sector, last_sector;
+
+	if (likely(conf->expand_progress == MaxSector))
+		return 0;
+
+	first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+	return (first_sector < conf->expand_progress &&
+		last_sector >= conf->expand_lo);
+}
+
+static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector)
+{
+	int redo = 0;
+
+	if (likely(conf->expand_progress == MaxSector))
+		return 0;
+
+	spin_lock_irq(&conf->device_lock);
+	redo = (raid5_expanding_overlap(conf, bi) ||
+		(unlikely(sector < conf->expand_progress) &&
+		disks == conf->previous_raid_disks));
+	spin_unlock_irq(&conf->device_lock);
+	return redo;
+}
+
 static int make_request(request_queue_t *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t 
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
 	int remaining;
+	sector_t stripe, sectors, block, r_sector, b_sector;
+	int sectors_per_chunk = conf->chunk_size >> 9;
+	int stripes_per_chunk, sectors_per_block;
+	int sectors_per_stripe;
+	int i, j;
+
+	DEFINE_WAIT(w);
+	int disks, data_disks;
 
 	atomic_inc(&conf->in_reqs_in_queue);
 
@@ -2653,105 +2690,136 @@ static int make_request(request_queue_t 
 	else
 		atomic_inc(&conf->reads_in);
 
-
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
-	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
-		DEFINE_WAIT(w);
-		int disks, data_disks;
-
-	retry:
-		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-		if (likely(conf->expand_progress == MaxSector))
-			disks = conf->raid_disks;
-		else {
-			/* spinlock is needed as expand_progress may be
-			 * 64bit on a 32bit platform, and so it might be
-			 * possible to see a half-updated value
-			 * Ofcourse expand_progress could change after
-			 * the lock is dropped, so once we get a reference
-			 * to the stripe that we think it is, we will have
-			 * to check again.
-			 */
-			spin_lock_irq(&conf->device_lock);
-			disks = conf->raid_disks;
-			if (logical_sector >= conf->expand_progress)
-				disks = conf->previous_raid_disks;
-			else {
-				if (logical_sector >= conf->expand_lo) {
-					spin_unlock_irq(&conf->device_lock);
-					schedule();
-					goto retry;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-		}
-		data_disks = disks - conf->max_degraded;
+	sectors = bi->bi_size >> 9;
+	stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
 
- 		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-						  &dd_idx, &pd_idx, conf);
-		PRINTK("raid5: make_request, sector %llu logical %llu\n",
-			(unsigned long long)new_sector, 
-			(unsigned long long)logical_sector);
+redo_bio:
+	/* stripe by stripe handle needs a stable raid layout, so if this
+	 * reuqest covers the expanding region, wait it over. 
+	 * Furthermore, we may get here with partial request handled, so
+	 * wait for the bi_phys_segment to be 1 also. -jay */
+	spin_lock_irq(&conf->device_lock);
+	wait_event_lock_irq(conf->wait_for_overlap,
+			(bi->bi_phys_segments == 1) &&
+			!raid5_expanding_overlap(conf, bi),
+			conf->device_lock,
+			(unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap)));
+
+	disks = conf->raid_disks;
+	if (unlikely(logical_sector >= conf->expand_progress))
+		disks = conf->previous_raid_disks;
+	data_disks = disks - conf->max_degraded;
+	spin_unlock_irq(&conf->device_lock);
 
-		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
-		if (sh) {
-			if (unlikely(conf->expand_progress != MaxSector)) {
-				/* expansion might have moved on while waiting for a
-				 * stripe, so we must do the range check again.
-				 * Expansion could still move past after this
-				 * test, but as we are holding a reference to
-				 * 'sh', we know that if that happens,
-				 *  STRIPE_EXPANDING will get set and the expansion
-				 * won't proceed until we finish with the stripe.
-				 */
-				int must_retry = 0;
-				spin_lock_irq(&conf->device_lock);
-				if (logical_sector <  conf->expand_progress &&
-				    disks == conf->previous_raid_disks)
-					/* mismatch, need to try again */
-					must_retry = 1;
-				spin_unlock_irq(&conf->device_lock);
-				if (must_retry) {
-					release_stripe(sh);
-					goto retry;
+	/* compute the block # */
+	sectors_per_stripe = STRIPE_SECTORS * data_disks;
+	sectors_per_block = stripes_per_chunk * sectors_per_stripe;
+
+	block = logical_sector & ~((sector_t)sectors_per_block - 1);
+	sector_div(block, sectors_per_block);
+
+repeat:
+	stripe = block * (sectors_per_block / data_disks);
+	b_sector = stripe * data_disks;
+	/* iterate through all stripes in this block,
+	 * where block is a set of internal stripes
+	 * which covers chunk */
+
+	for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
+		r_sector = b_sector + (i * STRIPE_SECTORS);
+		sh = NULL;
+		/* iterrate through all pages in the stripe */
+		for (j = 0; j < data_disks && sectors > 0; j++) {
+			DEFINE_WAIT(w);
+
+			if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
+			    r_sector >= last_sector) {
+				r_sector += sectors_per_chunk;
+				continue;
+			}
+
+retry:
+			prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+			new_sector = raid5_compute_sector(r_sector, disks,
+							data_disks, &dd_idx,
+							&pd_idx, conf);
+			if (sh == NULL) {
+				sh = get_active_stripe(conf, new_sector, disks, pd_idx,
+							(bi->bi_rw&RWA_MASK));
+				if (sh) {
+					/* we're handling the bio stripe by stripe, so when we found
+					 * the raid layout has been changed, we have to redo the 
+					 * whole bio because we don't which sectors in it has been
+					 * done, and which is not done. -jay */
+					if (raid5_redo_bio(conf, bi, disks, logical_sector))
+						goto redo_bio;
+
+					if (test_bit(STRIPE_EXPANDING, &sh->state)) {
+						/* Stripe is busy expanding or
+						 * add failed due to overlap.  Flush everything
+						 * and wait a while
+						 */
+						release_stripe(sh);
+						sh = NULL;
+						raid5_unplug_device(mddev->queue);
+						schedule();
+						goto retry;
+					}
+				} else {
+					/* cannot get stripe for read-ahead, just give-up */
+					finish_wait(&conf->wait_for_overlap, &w);
+					clear_bit(BIO_UPTODATE, &bi->bi_flags);
+					sectors = 0;
+					break;
 				}
 			}
+
 			/* FIXME what if we get a false positive because these
 			 * are being updated.
 			 */
-			if (logical_sector >= mddev->suspend_lo &&
-			    logical_sector < mddev->suspend_hi) {
+			if (r_sector >= mddev->suspend_lo &&
+			    r_sector < mddev->suspend_hi) {
+				handle_stripe(sh, NULL);
 				release_stripe(sh);
+				sh = NULL;
 				schedule();
 				goto retry;
 			}
 
-			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-				/* Stripe is busy expanding or
-				 * add failed due to overlap.  Flush everything
-				 * and wait a while
-				 */
-				raid5_unplug_device(mddev->queue);
+			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+				handle_stripe(sh, NULL);
 				release_stripe(sh);
+				sh = NULL;
+				raid5_unplug_device(mddev->queue);
 				schedule();
 				goto retry;
 			}
 			finish_wait(&conf->wait_for_overlap, &w);
+
+			BUG_ON (new_sector != stripe);
+			sectors -= STRIPE_SECTORS;
+			if (bi->bi_sector > r_sector)
+				sectors += bi->bi_sector - r_sector;
+			if (r_sector + STRIPE_SECTORS > last_sector)
+				sectors += r_sector + STRIPE_SECTORS - last_sector;
+			r_sector += sectors_per_chunk;
+		}
+		if (sh) {
 			handle_stripe(sh, NULL);
 			release_stripe(sh);
-		} else {
-			/* cannot get stripe for read-ahead, just give-up */
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			finish_wait(&conf->wait_for_overlap, &w);
-			break;
+			sh = NULL;
 		}
-			
+		stripe += STRIPE_SECTORS;
 	}
+	block++;
+	if (sectors > 0)
+		goto repeat;
+
 	spin_lock_irq(&conf->device_lock);
 	remaining = --bi->bi_phys_segments;
 	spin_unlock_irq(&conf->device_lock);
@@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq
 			atomic_read(&conf->active_stripes),
 			atomic_read(&conf->in_reqs_in_queue),
 			atomic_read(&conf->out_reqs_in_queue));
+	seq_printf (seq, "\t\t%u expanding overlap\n",
+			atomic_read(&conf->expanding_overlap));
 #if RAID5_DEBUG
 	seq_printf (seq, "\n");
 	printall(seq, conf);
diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
--- linux-2.6.18-53.orig/include/linux/raid/raid5.h	2007-12-28 14:55:08.000000000 +0800
+++ linux-2.6.18-53/include/linux/raid/raid5.h	2007-12-28 18:09:37.000000000 +0800
@@ -278,6 +278,7 @@ struct raid5_private_data {
 	atomic_t		bit_delayed;
 	atomic_t		in_reqs_in_queue;
 	atomic_t		out_reqs_in_queue;
+	atomic_t		expanding_overlap;
 };
 
 typedef struct raid5_private_data raid5_conf_t;
linux-patch-lustre 1.8.5+dfsg-3ubuntu1 / usr / src / kernel-patches / lustre / patches / raid5-stripe-by-stripe-handling-rhel5.patch