/usr/src/kernel-patches/lustre/patches/raid5-zerocopy-rhel5.patch

Index: linux-2.6.18-128.1.14/drivers/md/raid5.c
===================================================================
--- linux-2.6.18-128.1.14.orig/drivers/md/raid5.c	2009-06-19 12:34:46.000000000 -0600
+++ linux-2.6.18-128.1.14/drivers/md/raid5.c	2009-06-19 12:34:50.000000000 -0600
@@ -633,6 +633,9 @@
 		clear_buffer_uptodate(bh);
 	}
 #endif
+	/* A read during a direct (zero-copy) write is allowed */
+	/* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
+	BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -669,6 +672,10 @@
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 	
+	if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+	}
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -910,7 +917,27 @@
 	return r_sector;
 }
 
+/* Single PageConstant page backing the whole stripe at @sector, or NULL */
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+	sector_t pos = bio->bi_sector;
+	struct page *first = NULL;
+	struct bio_vec *bvl;
+	int i;
 
+	bio_for_each_segment(bvl, bio, i) {
+		if (pos == sector)
+			first = bvl->bv_page;
+		pos += bvl->bv_len >> 9;
+		if (pos < sector + STRIPE_SECTORS)
+			continue;
+		/* segment reaches the stripe end: one constant page, or copy */
+		if (first == bvl->bv_page && PageConstant(first))
+			return first;
+		break;
+	}
+	return NULL;
+}
 
 /*
  * Copy data between a page in the stripe cache, and one or more bion
@@ -1002,8 +1029,9 @@
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-	void *ptr[MAX_XOR_BLOCKS];
+	void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
 	struct bio *chosen;
+	struct page *page;
 
 	PRINTK("compute_parity5, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -1053,34 +1081,92 @@
 		count = 1;
 	}
 	
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
+	for (i = disks; i--;) {
+		struct r5dev *dev = &sh->dev[i];
+		struct bio *wbi = dev->written;
+		sector_t sector;
+
+		if (!wbi)
+			continue;
+
+		sector = dev->sector;
+		set_bit(R5_LOCKED, &sh->dev[i].flags);
+		BUG_ON(test_bit(R5_Direct, &dev->flags));
+
+		/* check if it's covered by a single page
+		   and whole stripe is written at once.
+		 * in this case we can avoid memcpy() */
+		if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
+		    test_bit(R5_Insync, &dev->flags)) {
+			page = zero_copy_data(wbi, sector);
+			if (page) {
+				atomic_inc(&conf->writes_zcopy);
+				/* The pointer must be restored whenever the LOCKED
+				 * gets cleared. */
+				dev->req.bi_io_vec[0].bv_page = page;
+				set_bit(R5_Direct, &dev->flags);
+				clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+				clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+				continue;
 			}
+		}
 
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		/* do copy write */
+		atomic_inc(&conf->writes_copied);
+		clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+		set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+			copy_data(1, wbi, sh->dev[i].page, sector);
+			wbi = r5_next_bio(wbi, sector);
 		}
+	}
 
+	h_ptr[0] = ptr[0];
 	switch(method) {
 	case RECONSTRUCT_WRITE:
 	case CHECK_PARITY:
-		for (i=disks; i--;)
-			if (i != pd_idx) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
+		for (i=disks; i--;) {
+			if (i == pd_idx)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+
+			/* A zero-copy page may be a highmem page, which has to be
+			 * mapped temporarily: XOR it into the parity right away. -jay
+			 */
+			if (PageHighMem(page)) {
+				h_ptr[1] = kmap_atomic(page, KM_USER0);
+				xor_block(2, STRIPE_SIZE, h_ptr);
+				kunmap_atomic(h_ptr[1], KM_USER0);
+			} else {
+				ptr[count++] = page_address(page);
 			}
+			check_xor();
+		}
 		break;
 	case READ_MODIFY_WRITE:
-		for (i = disks; i--;)
-			if (sh->dev[i].written) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
+		for (i = disks; i--;) {
+			if (!sh->dev[i].written)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+
+			/* A zero-copy page may be a highmem page, which has to be
+			 * mapped temporarily: XOR it into the parity right away. -jay
+			 */
+			if (PageHighMem(page)) {
+				h_ptr[1] = kmap_atomic(page, KM_USER0);
+				xor_block(2, STRIPE_SIZE, h_ptr);
+				kunmap_atomic(h_ptr[1], KM_USER0);
+			} else {
+				ptr[count++] = page_address(page);
 			}
+			check_xor();
+		}
 	}
 	if (count != 1)
 		xor_block(count, STRIPE_SIZE, ptr);
@@ -1097,6 +1183,7 @@
 	raid6_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
 	struct bio *chosen;
+	struct page *page;
 	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
 	void *ptrs[disks];
 
@@ -1126,18 +1213,49 @@
 		BUG();		/* Not implemented yet */
 	}
 
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
+	for (i = disks; i--;) {
+		struct r5dev *dev = &sh->dev[i];
+		struct bio *wbi = dev->written;
+		sector_t sector;
+
+		if (!wbi)
+			continue;
+
+		sector = sh->dev[i].sector;
+		set_bit(R5_LOCKED, &sh->dev[i].flags);
+		BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+		/* check if it's covered by a single page
+		 * and whole stripe is written at once.
+		 * in this case we can avoid memcpy() */
+		if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
+		    test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
+			page = zero_copy_data(wbi, sector);
+			/* We don't do zerocopy on a HighMem page. Raid6 needs
+			 * the content of all pages to be accessible at once
+			 * when computing the P/Q parity. To support HighMem
+			 * pages as well, the gen_syndrome() algorithm would
+			 * have to be modified. -jay */
+			if (page && !PageHighMem(page)) {
+				atomic_inc(&conf->writes_zcopy);
+				/* The pointer must be restored whenever the LOCKED
+				 * gets cleared. */
+				sh->dev[i].req.bi_io_vec[0].bv_page = page;
+				set_bit(R5_Direct, &sh->dev[i].flags);
+				clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+				clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+				continue;
 			}
+		}
 
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		atomic_inc(&conf->writes_copied);
+		clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+		set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+			copy_data(1, wbi, sh->dev[i].page, sector);
+			wbi = r5_next_bio(wbi, sector);
 		}
+	}
 
 //	switch(method) {
 //	case RECONSTRUCT_WRITE:
@@ -1148,8 +1266,12 @@
 		count = 0;
 		i = d0_idx;
 		do {
-			ptrs[count++] = page_address(sh->dev[i].page);
-			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
+			else
+				ptrs[count++] = page_address(sh->dev[i].page);
+			if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
+			    !test_bit(R5_Direct, &sh->dev[i].flags))
 				printk("block %d/%d not uptodate on parity calc\n", i,count);
 			i = raid6_next_disk(i, disks);
 		} while ( i != d0_idx );
@@ -1596,7 +1718,8 @@
 		if (sh->dev[i].written) {
 		    dev = &sh->dev[i];
 		    if (!test_bit(R5_LOCKED, &dev->flags) &&
-			 test_bit(R5_UPTODATE, &dev->flags) ) {
+			 (test_bit(R5_UPTODATE, &dev->flags) ||
+			  test_bit(R5_Direct, &dev->flags)) ) {
 			/* We can return any write requests */
 			    struct bio *wbi, *wbi2;
 			    int bitmap_end = 0;
@@ -1604,6 +1727,7 @@
 			    spin_lock_irq(&conf->device_lock);
 			    wbi = dev->written;
 			    dev->written = NULL;
+			    clear_bit(R5_Direct, &dev->flags);
 			    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 				    wbi2 = r5_next_bio(wbi, dev->sector);
 				    if (--wbi->bi_phys_segments == 0) {
@@ -1967,6 +2091,15 @@
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			PRINTK("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
+
+			if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+				/* Restore the page pointer of req; otherwise
+				 * no read would be permitted on this stripe,
+				 * which is not what we want. -jay */
+				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+				sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+			}
+
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
@@ -2172,7 +2305,8 @@
 			if (sh->dev[i].written) {
 				dev = &sh->dev[i];
 				if (!test_bit(R5_LOCKED, &dev->flags) &&
-				    test_bit(R5_UPTODATE, &dev->flags) ) {
+				    (test_bit(R5_UPTODATE, &dev->flags) ||
+				     test_bit(R5_Direct, &dev->flags)) ) {
 					/* We can return any write requests */
 					int bitmap_end = 0;
 					struct bio *wbi, *wbi2;
@@ -2181,6 +2315,7 @@
 					spin_lock_irq(&conf->device_lock);
 					wbi = dev->written;
 					dev->written = NULL;
+					clear_bit(R5_Direct, &dev->flags);
 					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 						wbi2 = r5_next_bio(wbi, dev->sector);
 						if (--wbi->bi_phys_segments == 0) {
@@ -2532,6 +2667,15 @@
 				set_bit(STRIPE_DEGRADED, &sh->state);
 			PRINTK("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
+
+			if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+				/* Restore the page pointer of req; otherwise
+				 * no read would be permitted on this stripe,
+				 * which is not what we want. -jay */
+				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+				sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+			}
+
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
@@ -3449,6 +3593,9 @@
 	mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
 	mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
 
+	/* raid5 device is able to do zcopy right now. */
+	mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
+
 	return 0;
 abort:
 	if (conf) {
@@ -3535,9 +3682,11 @@
 			atomic_read(&conf->handled_in_raid5d),
 			atomic_read(&conf->out_of_stripes),
 			atomic_read(&conf->handle_called));
-	seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
+	seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
 			atomic_read(&conf->reads_for_rmw),
-			atomic_read(&conf->reads_for_rcw));
+			atomic_read(&conf->reads_for_rcw),
+			atomic_read(&conf->writes_zcopy),
+			atomic_read(&conf->writes_copied));
 	seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
 			atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
 			atomic_read(&conf->active_stripes),
Index: linux-2.6.18-128.1.14/include/linux/backing-dev.h
===================================================================
--- linux-2.6.18-128.1.14.orig/include/linux/backing-dev.h	2009-06-19 12:33:11.000000000 -0600
+++ linux-2.6.18-128.1.14/include/linux/backing-dev.h	2009-06-19 12:34:50.000000000 -0600
@@ -48,6 +48,7 @@
 #define BDI_CAP_READ_MAP	0x00000010	/* Can be mapped for reading */
 #define BDI_CAP_WRITE_MAP	0x00000020	/* Can be mapped for writing */
 #define BDI_CAP_EXEC_MAP	0x00000040	/* Can be mapped for execution */
+#define BDI_CAP_PAGE_CONSTANT_WRITE	0x00000080	/* Zcopy write - for raid5 */
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
 
@@ -94,11 +95,18 @@
 #define bdi_cap_account_dirty(bdi) \
 	(!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
 
+#define bdi_cap_page_constant_write(bdi) \
+	((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
+
 #define mapping_cap_writeback_dirty(mapping) \
 	bdi_cap_writeback_dirty((mapping)->backing_dev_info)
 
 #define mapping_cap_account_dirty(mapping) \
 	bdi_cap_account_dirty((mapping)->backing_dev_info)
 
+/* Does the mapping's backing device set BDI_CAP_PAGE_CONSTANT_WRITE (zcopy)? */
+#define mapping_cap_page_constant_write(mapping) \
+	bdi_cap_page_constant_write((mapping)->backing_dev_info)
+
 
 #endif		/* _LINUX_BACKING_DEV_H */
Index: linux-2.6.18-128.1.14/include/linux/page-flags.h
===================================================================
--- linux-2.6.18-128.1.14.orig/include/linux/page-flags.h	2009-06-19 12:33:11.000000000 -0600
+++ linux-2.6.18-128.1.14/include/linux/page-flags.h	2009-06-19 12:34:50.000000000 -0600
@@ -87,6 +87,7 @@
 #define PG_reclaim		17	/* To be reclaimed asap */
 #define PG_nosave_free		18	/* Free, should not be written */
 #define PG_buddy		19	/* Page is free, on buddy lists */
+#define PG_constant		21	/* To mark if the page is constant */
 #define PG_xpmem		27	/* Testing for xpmem. */
 
 /* PG_owner_priv_1 users should have descriptive aliases */
@@ -288,6 +289,14 @@
 
 struct page;	/* forward declaration */
 
+/* PG_constant accessors; each one parenthesises its @page argument. */
+#define PageConstant(page)	test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page)	set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page)	clear_bit(PG_constant, &(page)->flags)
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+extern int set_page_constant(struct page *page);
+extern void clear_page_constant(struct page *);
+
 int test_clear_page_dirty(struct page *page);
 int test_clear_page_writeback(struct page *page);
 int test_set_page_writeback(struct page *page);
Index: linux-2.6.18-128.1.14/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.18-128.1.14.orig/include/linux/raid/raid5.h	2009-06-19 12:34:42.000000000 -0600
+++ linux-2.6.18-128.1.14/include/linux/raid/raid5.h	2009-06-19 12:34:50.000000000 -0600
@@ -156,8 +156,9 @@
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
 #define	R5_ReadError	8	/* seen a read error here recently */
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
-
 #define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Direct	11	/* Use the pages in bio to do the write directly. */
+
 /*
  * Write method
  */
Index: linux-2.6.18-128.1.14/mm/filemap.c
===================================================================
--- linux-2.6.18-128.1.14.orig/mm/filemap.c	2009-06-19 12:33:11.000000000 -0600
+++ linux-2.6.18-128.1.14/mm/filemap.c	2009-06-19 12:34:50.000000000 -0600
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/rmap.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <trace/mm.h>
 #include "internal.h"
@@ -567,11 +568,55 @@
 		if (!test_clear_page_writeback(page))
 			BUG();
 	}
+	clear_page_constant(page);
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_writeback);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
+/* Make a page constant: any write to the page is blocked until
+ * clear_page_constant() is called.  The caller must hold the page lock.
+ * Returns SWAP_SUCCESS, or SWAP_FAIL if the page cannot be made constant.
+ */
+int set_page_constant(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	/* If it's an anonymous page that hasn't been added to the swap cache,
+	 * return directly because we have no way to swap this page.
+	 */
+	if (page_mapping(page) == NULL)
+		return SWAP_FAIL;
+
+	BUG_ON(!PageUptodate(page));
+
+	/* Clear page uptodate before trying to remove the page from the
+	 * user's page tables; otherwise the page may be reinstalled by a
+	 * page access which happens between try_to_unmap() and
+	 * ClearPageUptodate(). -jay
+	 */
+	ClearPageUptodate(page);
+	if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
+		SetPageUptodate(page);
+		return SWAP_FAIL;
+	}
+	SetPageConstant(page);
+	return SWAP_SUCCESS;
+}
+
+void clear_page_constant(struct page *page)
+{
+	if (!PageConstant(page))	/* never made constant: nothing to undo */
+		return;
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageUptodate(page));
+	ClearPageConstant(page);
+	SetPageUptodate(page);		/* was cleared by set_page_constant() */
+	unlock_page(page);
+}
+EXPORT_SYMBOL(set_page_constant);
+EXPORT_SYMBOL(clear_page_constant);
+
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
linux-patch-lustre 1.8.5+dfsg-3ubuntu1 / usr / src / kernel-patches / lustre / patches / raid5-zerocopy-rhel5.patch