/usr/src/kernel-patches/lustre/patches/raid6-zerocopy.patch is in linux-patch-lustre 1.8.5+dfsg-3ubuntu1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
diff -pur linux-2.6.9-67.orig/drivers/md/raid6main.c linux-2.6.9-67/drivers/md/raid6main.c
--- linux-2.6.9-67.orig/drivers/md/raid6main.c 2009-02-15 10:24:30.000000000 +0800
+++ linux-2.6.9-67/drivers/md/raid6main.c 2009-02-15 10:26:17.000000000 +0800
@@ -430,6 +430,9 @@ static int raid6_end_read_request (struc
clear_buffer_uptodate(bh);
}
#endif
+ /* Read on a Direct write is allowable */
+ /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
@@ -468,6 +471,10 @@ static int raid6_end_write_request (stru
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+ }
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
__release_stripe(conf, sh);
@@ -664,7 +671,27 @@ static sector_t compute_blocknr(struct s
return r_sector;
}
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+ sector_t bi_sector = bio->bi_sector;
+ struct page *page = NULL;
+ struct bio_vec *bvl;
+ int i;
+ bio_for_each_segment(bvl, bio, i) {
+ if (sector == bi_sector)
+ page = bio_iovec_idx(bio, i)->bv_page;
+ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+ if (bi_sector >= sector + STRIPE_SECTORS) {
+ /* check if the stripe is covered by one page */
+ if (page == bio_iovec_idx(bio, i)->bv_page &&
+ PageConstant(page))
+ return page;
+ return NULL;
+ }
+ }
+ return NULL;
+}
/*
* Copy data between a page in the stripe cache, and one or more bion
@@ -731,6 +758,7 @@ static void compute_parity(struct stripe
raid6_conf_t *conf = sh->raid_conf;
int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
struct bio *chosen;
+ struct page *page;
/**** FIX THIS: This could be very bad if disks is close to 256 ****/
void *ptrs[disks];
@@ -761,18 +789,46 @@ static void compute_parity(struct stripe
BUG(); /* Not implemented yet */
}
- for (i = disks; i--;)
- if (sh->dev[i].written) {
- sector_t sector = sh->dev[i].sector;
- struct bio *wbi = sh->dev[i].written;
- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
- copy_data(1, wbi, sh->dev[i].page, sector);
- wbi = r5_next_bio(wbi, sector);
+ for (i = disks; i--;) {
+ struct bio *wbi = sh->dev[i].written;
+ sector_t sector;
+
+ if (!wbi)
+ continue;
+
+ sector = sh->dev[i].sector;
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+ /* check if it's covered by a single page
+ * and whole stripe is written at once.
+ * in this case we can avoid memcpy() */
+ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
+ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
+ page = zero_copy_data(wbi, sector);
+ /* we don't do zerocopy on a HighMem page. RAID6 tends
+ * to prepare all of the pages' content to be accessed
+ * before computing PQ parity. To support HighMem
+ * pages as well, we would have to modify the gen_syndrome()
+ * algorithm. -jay */
+ if (page && !PageHighMem(page)) {
+ atomic_inc(&conf->writes_zcopy);
+ sh->dev[i].req.bi_io_vec[0].bv_page = page;
+ set_bit(R5_Direct, &sh->dev[i].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+ continue;
}
+ }
- set_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ atomic_inc(&conf->writes_copied);
+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+ copy_data(1, wbi, sh->dev[i].page, sector);
+ wbi = r5_next_bio(wbi, sector);
}
+ }
// switch(method) {
// case RECONSTRUCT_WRITE:
@@ -783,7 +839,10 @@ static void compute_parity(struct stripe
count = 0;
i = d0_idx;
do {
- ptrs[count++] = page_address(sh->dev[i].page);
+ if (test_bit(R5_Direct, &sh->dev[i].flags))
+ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
+ else
+ ptrs[count++] = page_address(sh->dev[i].page);
i = raid6_next_disk(i, disks);
} while ( i != d0_idx );
@@ -1185,7 +1244,8 @@ static void handle_stripe(struct stripe_
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags) ) {
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Direct, &dev->flags)) ) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
PRINTK("Return write for stripe %llu disc %d\n",
@@ -1193,6 +1253,7 @@ static void handle_stripe(struct stripe_
spin_lock_irq(&conf->device_lock);
wbi = dev->written;
dev->written = NULL;
+ clear_bit(R5_Direct, &dev->flags);
while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
if (--wbi->bi_phys_segments == 0) {
@@ -1503,6 +1564,15 @@ static void handle_stripe(struct stripe_
} else {
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
+
+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+ /* restore the page pointer of req, otherwise
+ * no read is permitted on this stripe, which is
+ * not what we want. -jay */
+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+ }
+
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
atomic_dec(&conf->delayed);
@@ -2008,6 +2078,7 @@ static int run (mddev_t *mddev)
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
}
+ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
/* Ok, everything is just fine now */
mddev->array_size = mddev->size * (mddev->raid_disks - 2);
@@ -2095,9 +2166,11 @@ static void status (struct seq_file *seq
atomic_read(&conf->handled_in_raid5d),
atomic_read(&conf->out_of_stripes),
atomic_read(&conf->handle_called));
- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
+ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
atomic_read(&conf->reads_for_rmw),
- atomic_read(&conf->reads_for_rcw));
+ atomic_read(&conf->reads_for_rcw),
+ atomic_read(&conf->writes_zcopy),
+ atomic_read(&conf->writes_copied));
seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
atomic_read(&conf->delayed),
atomic_read(&conf->active_stripes),