Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (53 commits)
  md/raid5 revise rules for when to update metadata during reshape
  md/raid5: minor code cleanups in make_request.
  md: remove CONFIG_MD_RAID_RESHAPE config option.
  md/raid5: be more careful about write ordering when reshaping.
  md: don't display meaningless values in sysfs files resync_start and sync_speed
  md/raid5: allow layout and chunksize to be changed on active array.
  md/raid5: reshape using largest of old and new chunk size
  md/raid5: prepare for allowing reshape to change layout
  md/raid5: prepare for allowing reshape to change chunksize.
  md/raid5: clearly differentiate 'before' and 'after' stripes during reshape.
  Documentation/md.txt update
  md: allow number of drives in raid5 to be reduced
  md/raid5: change reshape-progress measurement to cope with reshaping backwards.
  md: add explicit method to signal the end of a reshape.
  md/raid5: enhance raid5_size to work correctly with negative delta_disks
  md/raid5: drop qd_idx from r6_state
  md/raid6: move raid6 data processing to raid6_pq.ko
  md: raid5 run(): Fix max_degraded for raid level 4.
  md: 'array_size' sysfs attribute
  md: centralize ->array_sectors modifications
  ...
This commit is contained in:
Linus Torvalds
2009-04-03 09:08:19 -07:00
當前提交 223cdea4c4
共有 39 個文件被更改,包括 2000 次插入858 次删除

查看文件

@@ -121,6 +121,7 @@ config MD_RAID10
config MD_RAID456
tristate "RAID-4/RAID-5/RAID-6 mode"
depends on BLK_DEV_MD
select MD_RAID6_PQ
select ASYNC_MEMCPY
select ASYNC_XOR
---help---
@@ -151,34 +152,8 @@ config MD_RAID456
If unsure, say Y.
config MD_RAID5_RESHAPE
bool "Support adding drives to a raid-5 array"
depends on MD_RAID456
default y
---help---
A RAID-5 set can be expanded by adding extra drives. This
requires "restriping" the array which means (almost) every
block must be written to a different place.
This option allows such restriping to be done while the array
is online.
You will need mdadm version 2.4.1 or later to use this
feature safely. During the early stage of reshape there is
a critical section where live data is being over-written. A
crash during this time needs extra care for recovery. The
newer mdadm takes a copy of the data in the critical section
and will restore it, if necessary, after a crash.
The mdadm usage is e.g.
mdadm --grow /dev/md1 --raid-disks=6
to grow '/dev/md1' to having 6 disks.
Note: The array can only be expanded, not contracted.
There should be enough spares already present to make the new
array workable.
If unsure, say Y.
config MD_RAID6_PQ
tristate
config MD_MULTIPATH
tristate "Multipath I/O support"

查看文件

@@ -2,20 +2,21 @@
# Makefile for the kernel software RAID and LVM drivers.
#
dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
dm-multipath-objs := dm-path-selector.o dm-mpath.o
dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-multipath-y += dm-path-selector.o dm-mpath.o
dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-snap-persistent.o
dm-mirror-objs := dm-raid1.o
md-mod-objs := md.o bitmap.o
raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
dm-mirror-y += dm-raid1.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
raid6int1.o raid6int2.o raid6int4.o \
raid6int8.o raid6int16.o raid6int32.o \
raid6altivec1.o raid6altivec2.o raid6altivec4.o \
raid6altivec8.o \
raid6mmx.o raid6sse1.o raid6sse2.o
hostprogs-y := mktables
hostprogs-y += mktables
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o
obj-$(CONFIG_MD_RAID456) += raid456.o
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o

查看文件

@@ -16,6 +16,7 @@
* wait if count gets too high, wake when it drops to half.
*/
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
@@ -26,8 +27,8 @@
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include "md.h"
#include "bitmap.h"
/* debug macros */
@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat
unsigned char *mappage;
if (page >= bitmap->pages) {
printk(KERN_ALERT
"%s: invalid bitmap page request: %lu (> %lu)\n",
bmname(bitmap), page, bitmap->pages-1);
/* This can happen if bitmap_start_sync goes beyond
* End-of-device while looking for a whole page.
* It is harmless.
*/
return -EINVAL;
}
@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
list_for_each_continue_rcu(pos, &mddev->disks) {
rdev = list_entry(pos, mdk_rdev_t, same_set);
if (rdev->raid_disk >= 0 &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
/* this is a usable devices */
atomic_inc(&rdev->nr_pending);
@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+ size/512 > 0)
/* bitmap runs in to metadata */
goto bad_alignment;
if (rdev->data_offset + mddev->size*2
if (rdev->data_offset + mddev->dev_sectors
> rdev->sb_start + bitmap->offset)
/* data runs in to bitmap */
goto bad_alignment;
@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
reason = "unrecognized superblock version";
else if (chunksize < PAGE_SIZE)
else if (chunksize < 512)
reason = "bitmap chunksize too small";
else if ((1 << ffz(~chunksize)) != chunksize)
reason = "bitmap chunksize not a power of 2";
@@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
}
if (bitmap->mddev->degraded)
/* Never clear bits or update events_cleared when degraded */
success = 0;
while (sectors) {
int blocks;
@@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
}
}
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
int degraded)
static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
int degraded)
{
bitmap_counter_t *bmc;
int rv;
@@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
return rv;
}
/*
 * Page-granular wrapper around __bitmap_start_sync().
 *
 * Resync works in units of whole pages, so a single answer from
 * __bitmap_start_sync() covering fewer than PAGE_SIZE>>9 blocks is not
 * enough.  Keep querying consecutive ranges, advancing @offset by the
 * number of blocks each call reported, until the accumulated total in
 * *@blocks reaches at least PAGE_SIZE>>9.
 *
 * Returns the bitwise OR of every __bitmap_start_sync() result.
 */
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
		      int degraded)
{
	int result = 0;
	int step;

	for (*blocks = 0; *blocks < (PAGE_SIZE>>9); *blocks += step) {
		result |= __bitmap_start_sync(bitmap, offset, &step, degraded);
		offset += step;
	}

	return result;
}
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
{
bitmap_counter_t *bmc;
@@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
wait_event(bitmap->mddev->recovery_wait,
atomic_read(&bitmap->mddev->recovery_active) == 0);
bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {

288
drivers/md/bitmap.h Normal file
查看文件

@@ -0,0 +1,288 @@
/*
* bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
*
* additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
*/
#ifndef BITMAP_H
#define BITMAP_H 1
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_HOSTENDIAN 3
#define BITMAP_MINOR 39
/*
* in-memory bitmap:
*
* Use 16 bit block counters to track pending writes to each "chunk".
* The 2 high order bits are special-purpose, the first is a flag indicating
* whether a resync is needed. The second is a flag indicating whether a
* resync is active.
* This means that the counter is actually 14 bits:
*
* +--------+--------+------------------------------------------------+
* | resync | resync | counter |
* | needed | active | |
* | (0-1) | (0-1) | (0-16383) |
* +--------+--------+------------------------------------------------+
*
* The "resync needed" bit is set when:
* a '1' bit is read from storage at startup.
* a write request fails on some drives
* a resync is aborted on a chunk with 'resync active' set
* It is cleared (and resync-active set) when a resync starts across all drives
* of the chunk.
*
*
* The "resync active" bit is set when:
* a resync is started on all drives, and resync_needed is set.
* resync_needed will be cleared (as long as resync_active wasn't already set).
* It is cleared when a resync completes.
*
* The counter counts pending write requests, plus the on-disk bit.
* When the counter is '1' and the resync bits are clear, the on-disk
* bit can be cleared as well, thus setting the counter to 0.
* When we set a bit, or in the counter (to start a write), if the field is
* 0, we first set the disk bit and set the counter to 1.
*
* If the counter is 0, the on-disk bit is clear and the stripe is clean.
* Anything that dirties the stripe pushes the counter to 2 (at least)
* and sets the on-disk bit (lazily).
* If a periodic sweep finds the counter at 2, it is decremented to 1.
* If the sweep finds the counter at 1, the on-disk bit is cleared and the
* counter goes to zero.
*
* Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
* counters as a fallback when "page" memory cannot be allocated:
*
* Normal case (page memory allocated):
*
* page pointer (32-bit)
*
* [ ] ------+
* |
* +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
* c1 c2 c2048
*
* Hijacked case (page memory allocation failed):
*
* hijacked page pointer (32-bit)
*
* [ ][ ] (no page memory allocated)
* counter #1 (16-bit) counter #2 (16-bit)
*
*/
#ifdef __KERNEL__
#define PAGE_BITS (PAGE_SIZE << 3)
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
typedef __u16 bitmap_counter_t;
/* Width of one in-memory counter and the derived bit/byte conversions. */
#define COUNTER_BITS 16
#define COUNTER_BIT_SHIFT 4
#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
/*
 * Layout of a counter: the top bit is "resync needed", the next bit is
 * "resync active", and the remaining COUNTER_BITS - 2 bits hold the count.
 */
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
/*
 * Field extractors.  The argument is parenthesized before the cast so that
 * an expression argument (e.g. "v >> 16" or "a | b") is converted as a
 * whole instead of having the cast bind to its first operand only.
 */
#define NEEDED(x) (((bitmap_counter_t) (x)) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) (x)) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) (x)) & COUNTER_MAX)
/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
#define BITMAP_BLOCK_SIZE 512
#define BITMAP_BLOCK_SHIFT 9
/* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
/* when hijacked, the counters and bits represent even larger "chunks" */
/* there will be 1024 chunks represented by each counter in the page pointers */
#define PAGEPTR_BLOCK_RATIO(bitmap) \
(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
#define PAGEPTR_BLOCK_SHIFT(bitmap) \
(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
/*
* on-disk bitmap:
*
* Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
* file a page at a time. There's a superblock at the start of the file.
*/
/* map chunks (bits) to file pages - offset by the size of the superblock */
#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
#endif
/*
* bitmap structures:
*/
#define BITMAP_MAGIC 0x6d746962
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
/*
 * Bitmap state flags.  Each value has exactly one bit set
 * (0x002, 0x004, 0x8000).
 */
enum bitmap_state {
BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
/* NOTE(review): presumably marks a host-endian (BITMAP_MAJOR_HOSTENDIAN)
 * bitmap as opposed to the little-endian version 4 layout -- confirm. */
BITMAP_HOSTENDIAN = 0x8000,
};
/* the superblock at the front of the bitmap file -- little endian */
/*
 * Superblock stored at the front of the bitmap file.
 *
 * The number at the start of each field comment is the field's byte
 * offset.  The named fields occupy the first 64 bytes and 'pad' extends
 * the structure to 256 bytes.  Multi-byte integer fields are declared
 * little-endian (__le32/__le64).  The parenthesised numbers (1)-(3)
 * refer to the notes that follow the structure.
 */
typedef struct bitmap_super_s {
__le32 magic; /* 0 BITMAP_MAGIC */
__le32 version; /* 4 the bitmap major for now, could change... */
__u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
__le64 events; /* 24 event counter for the bitmap (1)*/
__le64 events_cleared;/*32 event counter when last bit cleared (2) */
__le64 sync_size; /* 40 the size of the md device's sync range(3) */
__le32 state; /* 48 bitmap state information */
__le32 chunksize; /* 52 the bitmap chunk size in bytes */
__le32 daemon_sleep; /* 56 seconds between disk flushes */
__le32 write_behind; /* 60 number of outstanding write-behind writes */
__u8 pad[256 - 64]; /* set to zero */
} bitmap_super_t;
/* notes:
* (1) This event counter is updated before the eventcounter in the md superblock
* When a bitmap is loaded, it is only accepted if this event counter is equal
* to, or one greater than, the event counter in the superblock.
* (2) This event counter is updated when the other one is *if*and*only*if* the
* array is not degraded. As bits are not cleared when the array is degraded,
* this represents the last time that any bits were cleared.
* If a device is being added that has an event count with this value or
* higher, it is accepted as conforming to the bitmap.
* (3)This is the number of sectors represented by the bitmap, and is the range that
* resync happens across. For raid1 and raid5/6 it is the size of individual
* devices. For raid10 it is the size of the array.
*/
#ifdef __KERNEL__
/* the in-memory bitmap is represented by bitmap_pages */
/*
 * One entry of the in-memory bitmap: either a pointer to a page of
 * counters, or (when 'hijacked' is set) the pointer storage itself reused
 * as two counters.  'hijacked' and 'count' are bit-fields of 1 and 31
 * bits respectively.
 */
struct bitmap_page {
/*
* map points to the actual memory page
*/
char *map;
/*
* in emergencies (when map cannot be alloced), hijack the map
* pointer and use it as two counters itself
*/
unsigned int hijacked:1;
/*
* count of dirty bits on the page
*/
unsigned int count:31;
};
/* keep track of bitmap file pages that have pending writes on them */
/*
 * List node pairing a list_head with a struct page pointer, used to
 * chain bitmap file pages that have pending writes.
 */
struct page_list {
struct list_head list; /* linkage into the list of pending pages */
struct page *page; /* the bitmap file page this node refers to */
};
/* the main bitmap structure - one per mddev */
/*
 * Main bitmap structure (one per mddev).  It groups the in-memory counter
 * pages ('bp'), the chunk geometry, the backing storage description
 * (either 'file' or 'offset' from the superblock when 'file' is NULL),
 * and the bookkeeping used by the periodic bitmap daemon.
 */
struct bitmap {
struct bitmap_page *bp;
unsigned long pages; /* total number of pages in the bitmap */
unsigned long missing_pages; /* number of pages not yet allocated */
mddev_t *mddev; /* the md device that the bitmap is for */
int counter_bits; /* how many bits per block counter */
/* bitmap chunksize -- how much data does each bit represent? */
unsigned long chunksize;
unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */
/* We hold a count on the chunk currently being synced, and drop
* it when the last block is started. If the resync is aborted
* midway, we need to be able to drop that count, so we remember
* the counted chunk..
*/
unsigned long syncchunk;
__u64 events_cleared;
int need_sync;
/* bitmap spinlock */
spinlock_t lock;
long offset; /* offset from superblock if file is NULL */
struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap file superblock */
struct page **filemap; /* list of cache pages for the file */
unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
unsigned long file_pages; /* number of pages in the file */
int last_page_size; /* bytes in the last page */
unsigned long flags;
int allclean;
unsigned long max_write_behind; /* write-behind mode */
atomic_t behind_writes;
/*
* the bitmap daemon - periodically wakes up and sweeps the bitmap
* file, cleaning up bits and flushing out pages to disk as necessary
*/
unsigned long daemon_lastrun; /* jiffies of last run */
unsigned long daemon_sleep; /* how many seconds between updates? */
unsigned long last_end_sync; /* when we last called end_sync to
* update bitmap with resync progress */
atomic_t pending_writes; /* pending writes to the bitmap file */
wait_queue_head_t write_wait;
wait_queue_head_t overflow_wait;
};
/* the bitmap API */
/* these are used only by md/bitmap */
int bitmap_create(mddev_t *mddev);
void bitmap_flush(mddev_t *mddev);
void bitmap_destroy(mddev_t *mddev);
void bitmap_print_sb(struct bitmap *bitmap);
void bitmap_update_sb(struct bitmap *bitmap);
int bitmap_setallbits(struct bitmap *bitmap);
void bitmap_write_all(struct bitmap *bitmap);
void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
/* these are exported */
int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int behind);
void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int success, int behind);
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct bitmap *bitmap);
#endif
#endif

查看文件

@@ -62,7 +62,10 @@
#define ModeShift 5
#define MaxFault 50
#include <linux/raid/md.h>
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include "md.h"
#include <linux/seq_file.h>
static void faulty_fail(struct bio *bio, int error)
@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
return 0;
}
/*
 * ->size method for the 'faulty' personality.
 *
 * A non-zero @raid_disks triggers a one-time warning.  The result is
 * @sectors when it is non-zero, otherwise the array's current
 * mddev->dev_sectors.
 */
static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	WARN_ONCE(raid_disks, "%s does not support generic reshape\n",
		  __func__);

	return sectors ? sectors : mddev->dev_sectors;
}
static int run(mddev_t *mddev)
{
mdk_rdev_t *rdev;
@@ -298,7 +312,7 @@ static int run(mddev_t *mddev)
list_for_each_entry(rdev, &mddev->disks, same_set)
conf->rdev = rdev;
mddev->array_sectors = mddev->size * 2;
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
mddev->private = conf;
reconfig(mddev, mddev->layout, -1);
@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality =
.stop = stop,
.status = status,
.reconfig = reconfig,
.size = faulty_size,
};
static int __init raid_init(void)

查看文件

@@ -16,7 +16,11 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/raid/linear.h>
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include "md.h"
#include "linear.h"
/*
* find which device holds a particular offset
@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits)
return ret;
}
/*
 * ->size method for the 'linear' personality.
 *
 * Always reports the array_sectors value held in the current private
 * configuration; a non-zero @sectors or @raid_disks triggers a one-time
 * warning and is otherwise ignored.
 */
static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	WARN_ONCE(sectors || raid_disks,
		  "%s does not support generic reshape\n", __func__);

	return mddev_to_conf(mddev)->array_sectors;
}
static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
{
linear_conf_t *conf;
@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
mddev->queue->max_sectors > (PAGE_SIZE>>9))
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->num_sectors = rdev->size * 2;
conf->array_sectors += rdev->size * 2;
disk->num_sectors = rdev->sectors;
conf->array_sectors += rdev->sectors;
cnt++;
}
@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev)
if (!conf)
return 1;
mddev->private = conf;
mddev->array_sectors = conf->array_sectors;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->unplug_fn = linear_unplug;
@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
newconf->prev = mddev_to_conf(mddev);
mddev->private = newconf;
mddev->raid_disks++;
mddev->array_sectors = newconf->array_sectors;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
return 0;
}
@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality =
.stop = linear_stop,
.status = linear_status,
.hot_add_disk = linear_add,
.size = linear_size,
};
static int __init linear_init (void)

29
drivers/md/linear.h Normal file
查看文件

@@ -0,0 +1,29 @@
#ifndef _LINEAR_H
#define _LINEAR_H
/* Per-component-device record kept by the linear personality. */
struct dev_info {
mdk_rdev_t *rdev; /* the underlying md component device */
sector_t num_sectors; /* size of this device, in sectors */
sector_t start_sector; /* NOTE(review): presumably the array sector at
* which this device begins -- confirm */
};
typedef struct dev_info dev_info_t;
/*
 * Private configuration of a linear array.
 *
 * The structure is followed in memory by one dev_info_t per component
 * device; 'disks' is declared as a C99 flexible array member (rather
 * than a GNU zero-length array) to express that.  sizeof() of the
 * structure is unaffected, so existing size computations stay valid.
 */
struct linear_private_data
{
	struct linear_private_data *prev;	/* earlier version */
	dev_info_t **hash_table;
	sector_t spacing;
	sector_t array_sectors;			/* total size of the array */
	int sector_shift;			/* shift before dividing
						 * by spacing
						 */
	dev_info_t disks[];			/* trailing per-device records */
};
/*
 * Fetch the linear personality's private configuration from an mddev.
 * The argument is parenthesized so that any pointer expression
 * (e.g. "p + 1" or "&array[i]") can be passed safely.
 */
#define mddev_to_conf(mddev) ((linear_conf_t *) (mddev)->private)
#endif

文件差異過大導致無法顯示 Load Diff

436
drivers/md/md.h Normal file
查看文件

@@ -0,0 +1,436 @@
/*
md.h : kernel internal structure of the Linux MD driver
Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
You should have received a copy of the GNU General Public License
(for example /usr/src/linux/COPYING); if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _MD_K_H
#define _MD_K_H
#ifdef CONFIG_BLOCK
#define MaxSector (~(sector_t)0)
typedef struct mddev_s mddev_t;
typedef struct mdk_rdev_s mdk_rdev_t;
/*
* options passed in raidrun:
*/
/* Currently this must fit in an 'int' */
#define MAX_CHUNK_SIZE (1<<30)
/*
* MD's 'extended' device
*/
struct mdk_rdev_s
{
struct list_head same_set; /* RAID devices within the same set */
sector_t sectors; /* Device size (in 512bytes sectors) */
mddev_t *mddev; /* RAID array if running */
int last_events; /* IO event timestamp */
struct block_device *bdev; /* block device handle */
struct page *sb_page;
int sb_loaded;
__u64 sb_events;
sector_t data_offset; /* start of data in array */
sector_t sb_start; /* offset of the super block (in 512byte sectors) */
int sb_size; /* bytes in the superblock */
int preferred_minor; /* autorun support */
struct kobject kobj;
/* A device can be in one of three states based on two flags:
* Not working: faulty==1 in_sync==0
* Fully working: faulty==0 in_sync==1
* Working, but not
* in sync with array
* faulty==0 in_sync==0
*
* It can never have faulty==1, in_sync==1
* This reduces the burden of testing multiple flags in many cases
*/
unsigned long flags;
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */
#define AllReserved 6 /* If whole device is reserved for
* one array */
#define AutoDetected 7 /* added by auto-detect */
#define Blocked 8 /* An error occurred on an externally
* managed array, don't allow writes
* until it is cleared */
#define StateChanged 9 /* Faulty or Blocked has changed during
* interrupt, so it needs to be
* notified by the thread */
wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */
int raid_disk; /* role of device in array */
int saved_raid_disk; /* role that device used to have in the
* array and could again if we did a partial
* resync from the bitmap
*/
sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
* support hot removal
*/
atomic_t read_errors; /* number of consecutive read errors that
* we have tried to ignore.
*/
atomic_t corrected_errors; /* number of corrected read errors,
* for reporting to userspace and storing
* in superblock.
*/
struct work_struct del_work; /* used for delayed sysfs removal */
struct sysfs_dirent *sysfs_state; /* handle for 'state'
* sysfs entry */
};
struct mddev_s
{
void *private;
struct mdk_personality *pers;
dev_t unit;
int md_minor;
struct list_head disks;
unsigned long flags;
#define MD_CHANGE_DEVS 0 /* Some device status has changed */
#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
#define MD_CHANGE_PENDING 2 /* superblock update in progress */
int suspended;
atomic_t active_io;
int ro;
struct gendisk *gendisk;
struct kobject kobj;
int hold_active;
#define UNTIL_IOCTL 1
#define UNTIL_STOP 2
/* Superblock information */
int major_version,
minor_version,
patch_version;
int persistent;
int external; /* metadata is
* managed externally */
char metadata_type[17]; /* externally set*/
int chunk_size;
time_t ctime, utime;
int level, layout;
char clevel[16];
int raid_disks;
int max_disks;
sector_t dev_sectors; /* used size of
* component devices */
sector_t array_sectors; /* exported array size */
int external_size; /* size managed
* externally */
__u64 events;
char uuid[16];
/* If the array is being reshaped, we need to record the
* new shape and an indication of where we are up to.
* This is written to the superblock.
* If reshape_position is MaxSector, then no reshape is happening (yet).
*/
sector_t reshape_position;
int delta_disks, new_level, new_layout, new_chunk;
struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
sector_t curr_resync; /* last block scheduled */
/* As resync requests can complete out of order, we cannot easily track
* how much resync has been completed. So we occasionally pause until
* everything completes, then set curr_resync_completed to curr_resync.
* As such it may be well behind the real resync mark, but it is a value
* we are certain of.
*/
sector_t curr_resync_completed;
unsigned long resync_mark; /* a recent timestamp */
sector_t resync_mark_cnt;/* blocks written at resync_mark */
sector_t curr_mark_cnt; /* blocks scheduled now */
sector_t resync_max_sectors; /* may be set by personality */
sector_t resync_mismatches; /* count of sectors where
* parity/replica mismatch found
*/
/* allow user-space to request suspension of IO to regions of the array */
sector_t suspend_lo;
sector_t suspend_hi;
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
int ok_start_degraded;
/* recovery/resync flags
* NEEDED: we might need to start a resync/recover
* RUNNING: a thread is running, or about to be started
* SYNC: actually doing a resync, not a recovery
* RECOVER: doing recovery, or need to try it.
* INTR: resync needs to be aborted for some reason
* DONE: thread is done and is waiting to be reaped
* REQUEST: user-space has requested a sync (used with SYNC)
* CHECK: user-space request for check-only, no repair
* RESHAPE: A reshape is happening
*
* If neither SYNC or RESHAPE are set, then it is a recovery.
*/
#define MD_RECOVERY_RUNNING 0
#define MD_RECOVERY_SYNC 1
#define MD_RECOVERY_RECOVER 2
#define MD_RECOVERY_INTR 3
#define MD_RECOVERY_DONE 4
#define MD_RECOVERY_NEEDED 5
#define MD_RECOVERY_REQUESTED 6
#define MD_RECOVERY_CHECK 7
#define MD_RECOVERY_RESHAPE 8
#define MD_RECOVERY_FROZEN 9
unsigned long recovery;
int recovery_disabled; /* if we detect that recovery
* will always fail, set this
* so we don't loop trying */
int in_sync; /* known to not need resync */
struct mutex reconfig_mutex;
atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
int changed; /* true if we might need to reread partition info */
int degraded; /* whether md should consider
* adding a spare
*/
int barriers_work; /* initialised to true, cleared as soon
* as a barrier request to slave
* fails. Only supported
*/
struct bio *biolist; /* bios that need to be retried
* because BIO_RW_BARRIER is not supported
*/
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
sector_t resync_min; /* user requested sync
* starts here */
sector_t resync_max; /* resync should pause
* when it gets here */
struct sysfs_dirent *sysfs_state; /* handle for 'array_state'
* file in sysfs.
*/
struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
struct work_struct del_work; /* used for delayed sysfs removal */
spinlock_t write_lock;
wait_queue_head_t sb_wait; /* for waiting on superblock updates */
atomic_t pending_writes; /* number of active superblock writes */
unsigned int safemode; /* if set, update "clean" superblock
* when no writes pending.
*/
unsigned int safemode_delay;
struct timer_list safemode_timer;
atomic_t writes_pending;
struct request_queue *queue; /* for plugging ... */
atomic_t write_behind; /* outstanding async IO */
unsigned int max_write_behind; /* 0 = sync */
struct bitmap *bitmap; /* the bitmap for the device */
struct file *bitmap_file; /* the bitmap file */
long bitmap_offset; /* offset from superblock of
* start of bitmap. May be
* negative, but not '0'
*/
long default_bitmap_offset; /* this is the offset to use when
* hot-adding a bitmap. It should
* eventually be settable by sysfs.
*/
struct list_head all_mddevs;
};
/*
 * Drop one pending-request reference on @rdev.
 *
 * If this was the last pending request and the device was marked Faulty,
 * flag the array as possibly needing recovery (MD_RECOVERY_NEEDED).
 * The Faulty bit is sampled before the counter is decremented and @rdev
 * is not accessed again afterwards.
 */
static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
{
	const int was_faulty = test_bit(Faulty, &rdev->flags);

	if (!atomic_dec_and_test(&rdev->nr_pending))
		return;

	if (was_faulty)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
}
/*
 * Operations table describing one RAID personality (level).
 *
 * Each personality fills in the callbacks it supports; the 'faulty' and
 * 'linear' personalities, for example, install a ->size method.
 * NOTE(review): which callbacks may be left NULL is decided by the md
 * core and is not visible here -- confirm before relying on it.
 */
struct mdk_personality
{
char *name;
int level;
struct list_head list;
struct module *owner;
int (*make_request)(struct request_queue *q, struct bio *bio);
int (*run)(mddev_t *mddev);
int (*stop)(mddev_t *mddev);
void (*status)(struct seq_file *seq, mddev_t *mddev);
/* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed
*/
void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_remove_disk) (mddev_t *mddev, int number);
int (*spare_active) (mddev_t *mddev);
sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (mddev_t *mddev, sector_t sectors);
/* size reports the array size in sectors; the callers shown pass
* sectors == 0 and raid_disks == 0 to ask for the current size.
*/
sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
int (*check_reshape) (mddev_t *mddev);
int (*start_reshape) (mddev_t *mddev);
void (*finish_reshape) (mddev_t *mddev);
int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
/* quiesce moves between quiescence states
* 0 - fully active
* 1 - no new requests allowed
* others - reserved
*/
void (*quiesce) (mddev_t *mddev, int state);
/* takeover is used to transition an array from one
* personality to another. The new personality must be able
* to handle the data in the current layout.
* e.g. 2drive raid1 -> 2drive raid5
* ndrive raid5 -> degraded n+1drive raid6 with special layout
* If the takeover succeeds, a new 'private' structure is returned.
* This needs to be installed and then ->run used to activate the
* array.
*/
void *(*takeover) (mddev_t *mddev);
};
/*
 * An md sysfs attribute: a generic 'struct attribute' plus show/store
 * callbacks that receive the mddev the attribute belongs to.
 */
struct md_sysfs_entry {
struct attribute attr;
/* NOTE(review): presumably formats the value into the buffer and
 * returns the number of bytes written -- confirm against the md core. */
ssize_t (*show)(mddev_t *, char *);
/* NOTE(review): presumably parses the given buffer of the given length
 * and returns the number of bytes consumed -- confirm. */
ssize_t (*store)(mddev_t *, const char *, size_t);
};
/*
 * Name to print for an array: the gendisk's disk_name when the array has
 * a gendisk, otherwise the placeholder "mdX".
 */
static inline char * mdname (mddev_t * mddev)
{
	if (!mddev->gendisk)
		return "mdX";

	return mddev->gendisk->disk_name;
}
/*
* Iterates through the rdevs linked via 'same_set' on the list 'head'.
* This expands to list_for_each_entry_safe(), so it is safe to remove the
* current 'rdev' inside the loop. Don't touch 'tmp' though: it is the
* cursor the iterator itself uses.
*/
#define rdev_for_each_list(rdev, tmp, head) \
list_for_each_entry_safe(rdev, tmp, head, same_set)
/*
* Iterates through the 'same array disks' list, i.e. (mddev)->disks.
* rdev_for_each() uses list_for_each_entry_safe() with 'tmp' as its
* cursor; rdev_for_each_rcu() uses list_for_each_entry_rcu() instead and
* takes no 'tmp'.
*/
#define rdev_for_each(rdev, tmp, mddev) \
list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
#define rdev_for_each_rcu(rdev, mddev) \
list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
/*
* State for one md kernel thread. md_register_thread() returns a pointer
* to one of these, and md_wakeup_thread() / md_unregister_thread() take it.
*/
typedef struct mdk_thread_s {
void (*run) (mddev_t *mddev); /* same signature as md_register_thread()'s 'run' argument */
mddev_t *mddev;
wait_queue_head_t wqueue;
unsigned long flags; /* NOTE(review): presumably holds THREAD_WAKEUP - confirm in md.c */
struct task_struct *tsk;
unsigned long timeout;
} mdk_thread_t;
/* NOTE(review): presumably a bit number within mdk_thread_t.flags - confirm in md.c */
#define THREAD_WAKEUP 0
/*
* Sleep in TASK_UNINTERRUPTIBLE on wait queue 'wq' until 'condition' is
* true. Must be entered with 'lock' held via spin_lock_irq(): 'condition'
* is always evaluated with the lock held, and the lock is dropped only
* around 'cmd' and schedule() before being re-taken for the next check.
* On exit the lock is held, the task state is TASK_RUNNING and the wait
* queue entry has been removed.
*/
#define __wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
wait_queue_t __wait; \
init_waitqueue_entry(&__wait, current); \
\
add_wait_queue(&wq, &__wait); \
for (;;) { \
set_current_state(TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock); \
} \
current->state = TASK_RUNNING; \
remove_wait_queue(&wq, &__wait); \
} while (0)
/*
* Front end for __wait_event_lock_irq(): tests 'condition' first and skips
* the wait-queue setup entirely when it is already true (the 'break'
* leaves the enclosing do/while(0)). Same arguments as the inner macro.
*/
#define wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
if (condition) \
break; \
__wait_event_lock_irq(wq, condition, lock, cmd); \
} while (0)
/* put_page() wrapper that accepts (and ignores) a NULL page pointer. */
static inline void safe_put_page(struct page *p)
{
	if (!p)
		return;

	put_page(p);
}
#endif /* CONFIG_BLOCK */
#endif
/*
* Prototypes for functions defined outside this header. The personality
* sources shown with this header call md_set_array_sectors() from their
* run() methods and md_unregister_thread() from stop().
*/
extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
mddev_t *mddev, const char *name);
extern void md_unregister_thread(mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);

查看文件

@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
uint8_t v;
uint8_t exptbl[256], invtbl[256];
printf("#include \"raid6.h\"\n");
printf("#include <linux/raid/pq.h>\n");
/* Compute multiplication table */
printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
printf("\t},\n");
}
printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfmul);\n");
printf("#endif\n");
/* Compute power-of-2 table (exponent) */
v = 1;
@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
}
}
printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfexp);\n");
printf("#endif\n");
/* Compute inverse table x^-1 == x^254 */
printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
}
}
printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfinv);\n");
printf("#endif\n");
/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
(j == 7) ? '\n' : ' ');
}
printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfexi);\n");
printf("#endif\n");
return 0;
}

查看文件

@@ -19,7 +19,11 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/raid/multipath.h>
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include "md.h"
#include "multipath.h"
#define MAX_WORK_PER_DISK 128
@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev)
spin_unlock_irqrestore(&conf->device_lock, flags);
}
/*
 * ->size method for multipath: always reports mddev->dev_sectors.
 * 'sectors' and 'raid_disks' are not used in the calculation; a one-time
 * warning is emitted if either is non-zero.
 */
static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	sector_t array_sectors = mddev->dev_sectors;

	WARN_ONCE(sectors || raid_disks,
		  "%s does not support generic reshape\n", __func__);

	return array_sectors;
}
static int multipath_run (mddev_t *mddev)
{
multipath_conf_t *conf;
@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
mddev->array_sectors = mddev->size * 2;
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality =
.error_handler = multipath_error,
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
};
static int __init multipath_init (void)

40
drivers/md/multipath.h Normal file
查看文件

@@ -0,0 +1,40 @@
#ifndef _MULTIPATH_H
#define _MULTIPATH_H
/*
* One entry of the multipath_private_data.multipaths array; holds the
* rdev for that entry.
*/
struct multipath_info {
mdk_rdev_t *rdev;
};
/*
* Private state of a multipath array, reached from an mddev through
* mddev_to_conf().
*/
struct multipath_private_data {
mddev_t *mddev;
struct multipath_info *multipaths; /* NOTE(review): presumably raid_disks entries - confirm in multipath.c */
int raid_disks;
int working_disks;
spinlock_t device_lock;
struct list_head retry_list; /* NOTE(review): presumably links multipath_bh.retry_list - confirm */
mempool_t *pool;
};
typedef struct multipath_private_data multipath_conf_t;
/*
 * this is the only point in the RAID code where we violate
 * C type safety. mddev->private is an 'opaque' pointer.
 *
 * The argument is parenthesized so that any pointer expression
 * (e.g. "base + 1") is evaluated before ->private is applied to it.
 */
#define mddev_to_conf(mddev) ((multipath_conf_t *) (mddev)->private)
/*
* this is our 'private' 'collective' MULTIPATH buffer head.
* it contains information about what kind of IO operations were started
* for this MULTIPATH operation, and about their status:
*/
struct multipath_bh {
mddev_t *mddev;
struct bio *master_bio; /* NOTE(review): presumably the bio submitted to the md device - confirm */
struct bio bio; /* embedded bio, as opposed to the master_bio pointer above */
int path;
struct list_head retry_list;
};
#endif

查看文件

@@ -18,7 +18,10 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/raid/raid0.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "raid0.h"
static void raid0_unplug(struct request_queue *q)
{
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
list_for_each_entry(rdev2, &mddev->disks, same_set) {
printk(KERN_INFO "raid0: comparing %s(%llu)",
bdevname(rdev1->bdev,b),
(unsigned long long)rdev1->size);
(unsigned long long)rdev1->sectors);
printk(KERN_INFO " with %s(%llu)\n",
bdevname(rdev2->bdev,b),
(unsigned long long)rdev2->size);
(unsigned long long)rdev2->sectors);
if (rdev2 == rdev1) {
printk(KERN_INFO "raid0: END\n");
break;
}
if (rdev2->size == rdev1->size)
{
if (rdev2->sectors == rdev1->sectors) {
/*
* Not unique, don't count it as a new
* group
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
mddev->queue->max_sectors > (PAGE_SIZE>>9))
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
if (!smallest || (rdev1->size <smallest->size))
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
}
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
goto abort;
}
zone->nb_dev = cnt;
zone->sectors = smallest->size * cnt * 2;
zone->sectors = smallest->sectors * cnt;
zone->zone_start = 0;
current_start = smallest->size * 2;
current_start = smallest->sectors;
curr_zone_start = zone->sectors;
/* now do the other zones */
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
rdev = conf->strip_zone[0].dev[j];
printk(KERN_INFO "raid0: checking %s ...",
bdevname(rdev->bdev, b));
if (rdev->size > current_start / 2) {
printk(KERN_INFO " contained as device %d\n",
c);
zone->dev[c] = rdev;
c++;
if (!smallest || (rdev->size <smallest->size)) {
smallest = rdev;
printk(KERN_INFO " (%llu) is smallest!.\n",
(unsigned long long)rdev->size);
}
} else
if (rdev->sectors <= current_start) {
printk(KERN_INFO " nope.\n");
continue;
}
printk(KERN_INFO " contained as device %d\n", c);
zone->dev[c] = rdev;
c++;
if (!smallest || rdev->sectors < smallest->sectors) {
smallest = rdev;
printk(KERN_INFO " (%llu) is smallest!.\n",
(unsigned long long)rdev->sectors);
}
}
zone->nb_dev = c;
zone->sectors = (smallest->size * 2 - current_start) * c;
zone->sectors = (smallest->sectors - current_start) * c;
printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
zone->nb_dev, (unsigned long long)zone->sectors);
zone->zone_start = curr_zone_start;
curr_zone_start += zone->sectors;
current_start = smallest->size * 2;
current_start = smallest->sectors;
printk(KERN_INFO "raid0: current zone start: %llu\n",
(unsigned long long)current_start);
}
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
return max;
}
/*
 * ->size method for raid0: the sum of ->sectors over every rdev on
 * mddev->disks. 'sectors' and 'raid_disks' do not influence the result;
 * a one-time warning is emitted if either is non-zero.
 */
static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	mdk_rdev_t *member;
	sector_t total = 0;

	WARN_ONCE(sectors || raid_disks,
		  "%s does not support generic reshape\n", __func__);

	list_for_each_entry(member, &mddev->disks, same_set) {
		total += member->sectors;
	}

	return total;
}
static int raid0_run (mddev_t *mddev)
{
unsigned cur=0, i=0, nb_zone;
s64 sectors;
raid0_conf_t *conf;
mdk_rdev_t *rdev;
if (mddev->chunk_size == 0) {
printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
goto out_free_conf;
/* calculate array device size */
mddev->array_sectors = 0;
list_for_each_entry(rdev, &mddev->disks, same_set)
mddev->array_sectors += rdev->size * 2;
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
(unsigned long long)mddev->array_sectors);
printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
(unsigned long long)conf->spacing);
{
sector_t s = mddev->array_sectors;
sector_t s = raid0_size(mddev, 0, 0);
sector_t space = conf->spacing;
int round;
conf->sector_shift = 0;
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
.run = raid0_run,
.stop = raid0_stop,
.status = raid0_status,
.size = raid0_size,
};
static int __init raid0_init (void)

28
drivers/md/raid0.h Normal file
查看文件

@@ -0,0 +1,28 @@
#ifndef _RAID0_H
#define _RAID0_H
/*
* One zone of a raid0 array: a run of 'sectors' sectors striped over the
* 'nb_dev' devices listed in 'dev'.
*/
struct strip_zone
{
sector_t zone_start; /* Zone offset in md_dev (in sectors) */
sector_t dev_start; /* Zone offset in real dev (in sectors) */
sector_t sectors; /* Zone size in sectors */
int nb_dev; /* # of devices attached to the zone */
mdk_rdev_t **dev; /* Devices attached to the zone */
};
/*
* Private state of a raid0 array, reached from an mddev through
* mddev_to_conf().
*/
struct raid0_private_data
{
struct strip_zone **hash_table; /* Table of indexes into strip_zone */
struct strip_zone *strip_zone; /* array of nr_strip_zones zones; raid0.c indexes it as strip_zone[0] etc. */
mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
int nr_strip_zones;
sector_t spacing; /* in sectors, per the "conf->spacing is %llu sectors" message in raid0_run() */
int sector_shift; /* shift this before divide by spacing */
};
typedef struct raid0_private_data raid0_conf_t;
/*
 * Cast the opaque mddev->private pointer to the raid0 configuration.
 * The argument is parenthesized so that any pointer expression
 * (e.g. "base + 1") is evaluated before ->private is applied to it.
 */
#define mddev_to_conf(mddev) ((raid0_conf_t *) (mddev)->private)
#endif

查看文件

@@ -31,10 +31,13 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "dm-bio-list.h"
#include <linux/delay.h>
#include <linux/raid/raid1.h>
#include <linux/raid/bitmap.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "dm-bio-list.h"
#include "raid1.h"
#include "bitmap.h"
#define DEBUG 0
#if DEBUG
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0;
}
max_sector = mddev->size << 1;
max_sector = mddev->dev_sectors;
if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chunk (there will
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return nr_sectors;
}
/*
 * ->size method for raid1: a non-zero 'sectors' is returned unchanged,
 * otherwise the current mddev->dev_sectors is reported.
 * 'raid_disks' is not used.
 */
static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	return sectors ? sectors : mddev->dev_sectors;
}
static int run(mddev_t *mddev)
{
conf_t *conf;
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
mddev->array_sectors = mddev->size * 2;
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
/* need to kick something here to make sure I/O goes? */
}
raise_barrier(conf);
lower_barrier(conf);
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
mddev->array_sectors = sectors;
md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
if (mddev->array_sectors / 2 > mddev->size &&
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
mddev->size = mddev->array_sectors / 2;
mddev->dev_sectors = sectors;
mddev->resync_max_sectors = sectors;
return 0;
}
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
.spare_active = raid1_spare_active,
.sync_request = sync_request,
.resize = raid1_resize,
.size = raid1_size,
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
};

132
drivers/md/raid1.h Normal file
查看文件

@@ -0,0 +1,132 @@
#ifndef _RAID1_H
#define _RAID1_H
typedef struct mirror_info mirror_info_t;
/*
* One entry of the r1_private_data_s.mirrors array: the rdev for that
* entry plus a head position recorded for it.
*/
struct mirror_info {
mdk_rdev_t *rdev;
sector_t head_position;
};
/*
* memory pools need a pointer to the mddev, so they can force an unplug
* when memory is tight, and a count of the number of drives that the
* pool was allocated for, so they know how much to allocate and free.
* mddev->raid_disks cannot be used, as it can change while a pool is active.
* These two items are stored in a kmalloced struct.
*/
struct pool_info {
mddev_t *mddev;
int raid_disks;
};
typedef struct r1bio_s r1bio_t;
/*
* Private state of a raid1 array, reached from an mddev through
* mddev_to_conf().
*/
struct r1_private_data_s {
mddev_t *mddev;
mirror_info_t *mirrors;
int raid_disks;
int last_used;
sector_t next_seq_sect;
spinlock_t device_lock;
struct list_head retry_list; /* NOTE(review): presumably links r1bio_s.retry_list - confirm */
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
/* queue of writes that have been unplugged */
struct bio_list flushing_bio_list;
/* for use when syncing mirrors: */
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
int nr_queued;
int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
wait_queue_head_t wait_barrier;
struct pool_info *poolinfo;
struct page *tmppage;
mempool_t *r1bio_pool;
mempool_t *r1buf_pool;
};
typedef struct r1_private_data_s conf_t;
/*
 * this is the only point in the RAID code where we violate
 * C type safety. mddev->private is an 'opaque' pointer.
 *
 * The argument is parenthesized so that any pointer expression
 * (e.g. "base + 1") is evaluated before ->private is applied to it.
 */
#define mddev_to_conf(mddev) ((conf_t *) (mddev)->private)
/*
* this is our 'private' RAID1 bio.
*
* it contains information about what kind of IO operations were started
* for this RAID1 operation, and about their status:
*/
struct r1bio_s {
atomic_t remaining; /* 'have we finished' count,
* used from IRQ handlers
*/
atomic_t behind_remaining; /* number of write-behind ios remaining
* in this BehindIO request
*/
sector_t sector;
int sectors;
unsigned long state; /* bit flags; see the R1BIO_* definitions */
mddev_t *mddev;
/*
* original bio going to /dev/mdx
*/
struct bio *master_bio;
/*
* if the IO is in READ direction, then this is where we read
*/
int read_disk;
struct list_head retry_list;
struct bitmap_update *bitmap_update;
/*
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
*/
struct bio *bios[0];
/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously allocated */
};
/* when we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate 'bios' pointer.
* IO_BLOCKED is the pointer value 1: a marker, not a real bio.
*/
#define IO_BLOCKED ((struct bio*)1)
/* bits for r1bio.state */
#define R1BIO_Uptodate 0
#define R1BIO_IsSync 1
#define R1BIO_Degraded 2
#define R1BIO_BehindIO 3
#define R1BIO_Barrier 4
#define R1BIO_BarrierRetry 5
/* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when
* any write-behind write succeeds, otherwise we call
* with failure when last write completes (and all failed).
* Record that bi_end_io was called with this flag...
*/
#define R1BIO_Returned 6

查看文件

@@ -18,10 +18,13 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "dm-bio-list.h"
#include <linux/delay.h>
#include <linux/raid/raid10.h>
#include <linux/raid/bitmap.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "dm-bio-list.h"
#include "raid10.h"
#include "bitmap.h"
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0;
skipped:
max_sector = mddev->size << 1;
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
goto skipped;
}
/*
 * ->size method for raid10.
 *
 * A zero 'sectors' or 'raid_disks' means "use the array's current value"
 * (mddev->dev_sectors / mddev->raid_disks). The per-device sector count
 * is converted to chunks, divided by far_copies, multiplied by the number
 * of disks, divided by near_copies and converted back to sectors.
 */
static sector_t
raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	conf_t *conf = mddev_to_conf(mddev);
	sector_t chunks;

	if (!sectors)
		sectors = mddev->dev_sectors;
	if (!raid_disks)
		raid_disks = mddev->raid_disks;

	chunks = sectors >> conf->chunk_shift;
	sector_div(chunks, conf->far_copies);
	chunks *= raid_disks;
	sector_div(chunks, conf->near_copies);

	return chunks << conf->chunk_shift;
}
static int run(mddev_t *mddev)
{
conf_t *conf;
@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev)
conf->far_offset = fo;
conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
size = mddev->size >> (conf->chunk_shift-1);
size = mddev->dev_sectors >> conf->chunk_shift;
sector_div(size, fc);
size = size * conf->raid_disks;
sector_div(size, nc);
@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev)
*/
stride += conf->raid_disks - 1;
sector_div(stride, conf->raid_disks);
mddev->size = stride << (conf->chunk_shift-1);
mddev->dev_sectors = stride << conf->chunk_shift;
if (fo)
stride = 1;
@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
mddev->array_sectors = size << conf->chunk_shift;
mddev->resync_max_sectors = size << conf->chunk_shift;
md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
mddev->queue->unplug_fn = raid10_unplug;
mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev)
{
conf_t *conf = mddev_to_conf(mddev);
raise_barrier(conf, 0);
lower_barrier(conf);
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality =
.spare_active = raid10_spare_active,
.sync_request = sync_request,
.quiesce = raid10_quiesce,
.size = raid10_size,
};
static int __init raid_init(void)

121
drivers/md/raid10.h Normal file
查看文件

@@ -0,0 +1,121 @@
#ifndef _RAID10_H
#define _RAID10_H
typedef struct mirror_info mirror_info_t;
/*
* One entry of the r10_private_data_s.mirrors array: the rdev for that
* entry plus a head position recorded for it.
*/
struct mirror_info {
mdk_rdev_t *rdev;
sector_t head_position;
};
typedef struct r10bio_s r10bio_t;
/*
* Private state of a raid10 array, reached from an mddev through
* mddev_to_conf().
*/
struct r10_private_data_s {
mddev_t *mddev;
mirror_info_t *mirrors;
int raid_disks;
spinlock_t device_lock;
/* geometry */
int near_copies; /* number of copies laid out raid0 style */
int far_copies; /* number of copies laid out
* at large strides across drives
*/
int far_offset; /* far_copies are offset by 1 stripe
* instead of many
*/
int copies; /* near_copies * far_copies.
* must be <= raid_disks
*/
sector_t stride; /* distance between far copies.
* This is size / far_copies unless
* far_offset, in which case it is
* 1 stripe.
*/
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask; /* chunk size in sectors, minus 1 (set in run()) */
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
int nr_queued;
int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
wait_queue_head_t wait_barrier;
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
struct page *tmppage;
};
typedef struct r10_private_data_s conf_t;
/*
 * this is the only point in the RAID code where we violate
 * C type safety. mddev->private is an 'opaque' pointer.
 *
 * The argument is parenthesized so that any pointer expression
 * (e.g. "base + 1") is evaluated before ->private is applied to it.
 */
#define mddev_to_conf(mddev) ((conf_t *) (mddev)->private)
/*
* this is our 'private' RAID10 bio.
*
* it contains information about what kind of IO operations were started
* for this RAID10 operation, and about their status:
*/
struct r10bio_s {
atomic_t remaining; /* 'have we finished' count,
* used from IRQ handlers
*/
sector_t sector; /* virtual sector number */
int sectors;
unsigned long state; /* bit flags; see the R10BIO_* definitions */
mddev_t *mddev;
/*
* original bio going to /dev/mdx
*/
struct bio *master_bio;
/*
* if the IO is in READ direction, then this is where we read
*/
int read_slot;
struct list_head retry_list;
/*
* if the IO is in WRITE direction, then multiple bios are used,
* one for each copy.
* When resyncing we also use one for each copy.
* When reconstructing, we use 2 bios, one for read, one for write.
* We choose the number when they are allocated.
*/
struct {
struct bio *bio;
sector_t addr;
int devnum;
} devs[0];
};
/* when we get a read error on a read-only array, we redirect to another
* device without failing the first device, or trying to over-write to
* correct the read error. To keep track of bad blocks on a per-bio
* level, we store IO_BLOCKED in the appropriate devs[].bio pointer.
* IO_BLOCKED is the pointer value 1: a marker, not a real bio.
*/
#define IO_BLOCKED ((struct bio*)1)
/* bits for r10bio.state */
#define R10BIO_Uptodate 0
#define R10BIO_IsSync 1
#define R10BIO_IsRecover 2
#define R10BIO_Degraded 3

文件差異過大導致無法顯示 Load Diff

474
drivers/md/raid5.h Normal file
查看文件

@@ -0,0 +1,474 @@
#ifndef _RAID5_H
#define _RAID5_H
#include <linux/raid/xor.h>
/*
*
* Each stripe contains one buffer per disc. Each buffer can be in
* one of a number of states stored in "flags". Changes between
* these states happen *almost* exclusively under a per-stripe
* spinlock. Some very specific changes can happen in bi_end_io, and
* these are not protected by the spin lock.
*
* The flag bits that are used to represent these states are:
* R5_UPTODATE and R5_LOCKED
*
* State Empty == !UPTODATE, !LOCK
* We have no data, and there is no active request
* State Want == !UPTODATE, LOCK
* A read request is being submitted for this block
* State Dirty == UPTODATE, LOCK
* Some new data is in this buffer, and it is being written out
* State Clean == UPTODATE, !LOCK
* We have valid data which is the same as on disc
*
* The possible state transitions are:
*
* Empty -> Want - on read or write to get old data for parity calc
* Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
* Empty -> Clean - on compute_block when computing a block for failed drive
* Want -> Empty - on failed read
* Want -> Clean - on successful completion of read request
* Dirty -> Clean - on successful completion of write request
* Dirty -> Clean - on failed write
* Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
*
* The Want->Empty, Want->Clean, Dirty->Clean, transitions
* all happen in b_end_io at interrupt time.
* Each sets the Uptodate bit before releasing the Lock bit.
* This leaves one multi-stage transition:
* Want->Dirty->Clean
* This is safe because thinking that a Clean buffer is actually dirty
* will at worst delay some action, and the stripe will be scheduled
* for attention after the transition is complete.
*
* There is one possibility that is not covered by these states. That
* is if one drive has failed and there is a spare being rebuilt. We
* can't distinguish between a clean block that has been generated
* from parity calculations, and a clean block that has been
* successfully written to the spare ( or to parity when resyncing).
* To distinguish these states we have a stripe bit STRIPE_INSYNC that
* is set whenever a write is scheduled to the spare, or to the parity
* disc if there is no spare. A sync request clears this bit, and
* when we find it set with no buffers locked, we know the sync is
* complete.
*
* Buffers for the md device that arrive via make_request are attached
* to the appropriate stripe in one of two lists linked on b_reqnext.
* One list (bh_read) for read requests, one (bh_write) for write.
* There should never be more than one buffer on the two lists
* together, but we are not guaranteed of that so we allow for more.
*
* If a buffer is on the read list when the associated cache buffer is
* Uptodate, the data is copied into the read buffer and its b_end_io
* routine is called. This may happen in the end_request routine only
* if the buffer has just successfully been read. end_request should
* remove the buffers from the list and then set the Uptodate bit on
* the buffer. Other threads may do this only if they first check
* that the Uptodate bit is set. Once they have checked that they may
* take buffers off the read queue.
*
* When a buffer on the write list is committed for write it is copied
* into the cache buffer, which is then marked dirty, and moved onto a
* third list, the written list (bh_written). Once both the parity
* block and the cached buffer are successfully written, any buffer on
* a written list can be returned with b_end_io.
*
* The write list and read list both act as fifos. The read list is
* protected by the device_lock. The write and written lists are
* protected by the stripe lock. The device_lock, which can be
* claimed while the stripe lock is held, is only for list
* manipulations and will only be held for a very short time. It can
* be claimed from interrupts.
*
*
* Stripes in the stripe cache can be on one of two lists (or on
* neither). The "inactive_list" contains stripes which are not
* currently being used for any request. They can freely be reused
* for another stripe. The "handle_list" contains stripes that need
* to be handled in some way. Both of these are fifo queues. Each
* stripe is also (potentially) linked to a hash bucket in the hash
* table so that it can be found by sector number. Stripes that are
* not hashed must be on the inactive_list, and will normally be at
* the front. All stripes start life this way.
*
* The inactive_list, handle_list and hash bucket lists are all protected by the
* device_lock.
* - stripes on the inactive_list never have their stripe_lock held.
* - stripes have a reference counter. If count==0, they are on a list.
* - If a stripe might need handling, STRIPE_HANDLE is set.
* - When refcount reaches zero, then if STRIPE_HANDLE it is put on
* handle_list else inactive_list
*
* This, combined with the fact that STRIPE_HANDLE is only ever
* cleared while a stripe has a non-zero count means that if the
* refcount is 0 and STRIPE_HANDLE is set, then it is on the
* handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
* the stripe is on inactive_list.
*
* The possible transitions are:
* activate an unhashed/inactive stripe (get_active_stripe())
* lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
* activate a hashed, possibly active stripe (get_active_stripe())
* lockdev check-hash if(!cnt++)unlink-stripe unlockdev
* attach a request to an active stripe (add_stripe_bh())
* lockdev attach-buffer unlockdev
* handle a stripe (handle_stripe())
* lockstripe clrSTRIPE_HANDLE ...
* (lockdev check-buffers unlockdev) ..
* change-state ..
* record io/ops needed unlockstripe schedule io/ops
* release an active stripe (release_stripe())
* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
*
* The refcount counts each thread that has activated the stripe,
* plus raid5d if it is handling it, plus one for each active request
* on a cached buffer, and plus one if the stripe is undergoing stripe
* operations.
*
* Stripe operations are performed outside the stripe lock,
* the stripe operations are:
* -copying data between the stripe cache and user application buffers
* -computing blocks to save a disk access, or to recover a missing block
* -updating the parity on a write operation (reconstruct write and
* read-modify-write)
* -checking parity correctness
* -running i/o to disk
* These operations are carried out by raid5_run_ops which uses the async_tx
* api to (optionally) offload operations to dedicated hardware engines.
* When requesting an operation handle_stripe sets the pending bit for the
* operation and increments the count. raid5_run_ops is then run whenever
* the count is non-zero.
* There are some critical dependencies between the operations that prevent some
* from being requested while another is in flight.
* 1/ Parity check operations destroy the in cache version of the parity block,
* so we prevent parity dependent operations like writes and compute_blocks
* from starting while a check is in progress. Some dma engines can perform
* the check without damaging the parity block, in these cases the parity
* block is re-marked up to date (assuming the check was successful) and is
* not re-read from disk.
* 2/ When a write operation is requested we immediately lock the affected
* blocks, and mark them as not up to date. This causes new read requests
* to be held off, as well as parity checks and compute block operations.
* 3/ Once a compute block operation has been requested handle_stripe treats
* that block as if it is up to date. raid5_run_ops guarantees that any
* operation that is dependent on the compute block result is initiated after
* the compute block completes.
*/
/*
* Operations state - intermediate states that are visible outside of sh->lock
* In general _idle indicates nothing is running, _run indicates a data
* processing operation is active, and _result means the data processing result
* is stable and can be acted upon. For simple operations like biofill and
* compute that only have an _idle and _run state they are indicated with
* sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
*/
/**
* enum check_states - handles syncing / repairing a stripe
* @check_state_idle - check operations are quiesced
* @check_state_run - check operation is running
* @check_state_check_result - set outside lock when check result is valid
* @check_state_compute_run - check failed and we are repairing
* @check_state_compute_result - set outside lock when compute result is valid
*/
enum check_states {
check_state_idle = 0,
check_state_run, /* parity check */
check_state_check_result,
check_state_compute_run, /* parity repair */
check_state_compute_result,
};
/**
* enum reconstruct_states - handles writing or expanding a stripe
* @reconstruct_state_idle - nothing is running
* @reconstruct_state_prexor_drain_run - prexor-write is running
* @reconstruct_state_drain_run - write is running
* @reconstruct_state_run - expand is running
* @reconstruct_state_prexor_drain_result - prexor-write result is stable
* @reconstruct_state_drain_result - write result is stable
* @reconstruct_state_result - expand result is stable
*/
enum reconstruct_states {
reconstruct_state_idle = 0,
reconstruct_state_prexor_drain_run, /* prexor-write */
reconstruct_state_drain_run, /* write */
reconstruct_state_run, /* expand */
reconstruct_state_prexor_drain_result,
reconstruct_state_drain_result,
reconstruct_state_result,
};
/*
* One entry of the stripe cache: per-stripe bookkeeping followed by one
* r5dev buffer per disk in the trailing dev[] array.
*/
struct stripe_head {
struct hlist_node hash;
struct list_head lru; /* inactive_list or handle_list */
struct raid5_private_data *raid_conf;
short generation; /* increments with every
* reshape */
sector_t sector; /* sector of this row */
short pd_idx; /* parity disk index */
short qd_idx; /* 'Q' disk index for raid6 */
short ddf_layout;/* use DDF ordering to calculate Q */
unsigned long state; /* state flags */
atomic_t count; /* nr of active thread/requests */
spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */
enum check_states check_state;
enum reconstruct_states reconstruct_state;
/* stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
*/
struct stripe_operations {
int target;
u32 zero_sum_result;
} ops;
struct r5dev {
struct bio req;
struct bio_vec vec;
struct page *page;
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags; /* R5_* bit flags */
} dev[1]; /* allocated with extra space depending on RAID geometry */
};
/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
* for handle_stripe. It is only valid under spin_lock(sh->lock);
*
* NOTE(review): the int members look like per-pass flags and counters
* gathered while scanning the stripe's devices - confirm against
* handle_stripe before relying on any individual field's meaning.
*/
struct stripe_head_state {
int syncing, expanding, expanded;
int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite;
int failed_num;
unsigned long ops_request; /* presumably a mask of STRIPE_OP_* bits - confirm */
};
/* r6_state - extra state data only relevant to r6
*
* NOTE(review): p_failed/q_failed presumably flag a failed P or Q block and
* failed_num[] presumably records up to two failed device indices - confirm
* against the raid6 stripe handler.
*/
struct r6_state {
int p_failed, q_failed, failed_num[2];
};
/* Flags */
/* These are bit numbers, not masks; presumably applied to r5dev.flags with
* the bit operations - confirm. Bit 6 is not assigned in this list.
*/
#define R5_UPTODATE 0 /* page contains current data */
#define R5_LOCKED 1 /* IO has been submitted on "req" */
#define R5_OVERWRITE 2 /* towrite covers whole page */
/* and some that are internal to handle_stripe */
#define R5_Insync 3 /* rdev && rdev->in_sync at start */
#define R5_Wantread 4 /* want to schedule a read */
#define R5_Wantwrite 5 /* presumably: want to schedule a write - confirm */
#define R5_Overlap 7 /* There is a pending overlapping request on this block */
#define R5_ReadError 8 /* seen a read error here recently */
#define R5_ReWrite 9 /* have tried to over-write the readerror */
#define R5_Expanded 10 /* This block now has post-expand data */
#define R5_Wantcompute 11 /* compute_block in progress treat as
* uptodate
*/
#define R5_Wantfill 12 /* dev->toread contains a bio that needs
* filling
*/
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
/*
* Write method
*/
#define RECONSTRUCT_WRITE 1
#define READ_MODIFY_WRITE 2
/* not a write method, but a compute_parity mode */
#define CHECK_PARITY 3
/* Additional compute_parity mode -- updates the parity w/o LOCKING */
#define UPDATE_PARITY 4
/*
* Stripe state
*
* Bit numbers for the stripe_head 'state' flags word. Numbering starts
* at 2; bits 0 and 1 are not assigned in this list.
*/
#define STRIPE_HANDLE 2
#define STRIPE_SYNCING 3
#define STRIPE_INSYNC 4
#define STRIPE_PREREAD_ACTIVE 5
#define STRIPE_DELAYED 6
#define STRIPE_DEGRADED 7
#define STRIPE_BIT_DELAY 8
#define STRIPE_EXPANDING 9
#define STRIPE_EXPAND_SOURCE 10
#define STRIPE_EXPAND_READY 11
#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
#define STRIPE_BIOFILL_RUN 14 /* biofill operation is running */
#define STRIPE_COMPUTE_RUN 15 /* compute operation is running */
/*
* Operation request flags
*
* Bit numbers; presumably set in stripe_head_state.ops_request to ask
* raid5_run_ops for the corresponding operation - confirm.
*/
#define STRIPE_OP_BIOFILL 0
#define STRIPE_OP_COMPUTE_BLK 1
#define STRIPE_OP_PREXOR 2
#define STRIPE_OP_BIODRAIN 3
#define STRIPE_OP_POSTXOR 4
#define STRIPE_OP_CHECK 5
/*
* Plugging:
*
* To improve write throughput, we need to delay the handling of some
* stripes until there has been a chance that several write requests
* for the one stripe have all been collected.
* In particular, any write request that would require pre-reading
* is put on a "delayed" queue until there are no stripes currently
* in a pre-read phase. Further, if the "delayed" queue is empty when
* a stripe is put on it then we "plug" the queue and do not process it
* until an unplug call is made. (the unplug_io_fn() is called).
*
* When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
* it to the count of prereading stripes.
* When write is initiated, or the stripe refcnt == 0 (just in case) we
* clear the PREREAD_ACTIVE flag and decrement the count
* Whenever the 'handle' queue is empty and the device is not plugged, we
* move any stripes from delayed to handle and clear the DELAYED flag and set
* PREREAD_ACTIVE.
* In stripe_handle, if we find pre-reading is necessary, we do it if
* PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
* HANDLE gets cleared if stripe_handle leaves nothing locked.
*/
/* disk_info - per-slot record of a member device of the array */
struct disk_info {
mdk_rdev_t *rdev; /* the md device in this slot; presumably NULL when empty - confirm */
};
/*
* raid5_private_data - per-array state for the RAID-4/5/6 personality:
* geometry, reshape progress, the stripe cache and its queues, and the
* table of member devices.
*/
struct raid5_private_data {
struct hlist_head *stripe_hashtbl;
mddev_t *mddev;
struct disk_info *spare;
int chunk_size, level, algorithm;
int max_degraded;
int raid_disks;
int max_nr_stripes;
/* reshape_progress is the leading edge of a 'reshape'
* It has value MaxSector when no reshape is happening
* If delta_disks < 0, it is the last sector we started work on,
* else it is the next sector to work on.
*/
sector_t reshape_progress;
/* reshape_safe is the trailing edge of a reshape. We know that
* before (or after) this address, all reshape has completed.
*/
sector_t reshape_safe;
int previous_raid_disks;
int prev_chunk, prev_algo;
short generation; /* increments with every reshape */
unsigned long reshape_checkpoint; /* Time we last updated
* metadata */
struct list_head handle_list; /* stripes needing handling */
struct list_head hold_list; /* preread ready stripes */
struct list_head delayed_list; /* stripes that have plugged requests */
struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
struct bio *retry_read_aligned; /* currently retrying aligned bios */
struct bio *retry_read_aligned_list; /* aligned bios retry list */
atomic_t preread_active_stripes; /* stripes with scheduled io */
atomic_t active_aligned_reads;
atomic_t pending_full_writes; /* full write backlog */
int bypass_count; /* bypassed prereads */
int bypass_threshold; /* preread nice */
struct list_head *last_hold; /* detect hold_list promotions */
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
*/
int active_name;
char cache_name[2][20];
struct kmem_cache *slab_cache; /* for allocating stripes */
int seq_flush, seq_write;
int quiesce;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
struct page *spare_page; /* Used when checking P/Q in raid6 */
/*
* Free stripes pool
*/
atomic_t active_stripes;
struct list_head inactive_list;
wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap;
int inactive_blocked; /* release of inactive stripes blocked,
* waiting for 25% to be free
*/
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
*/
struct mdk_thread_s *thread;
};
/* Short name for the RAID-4/5/6 per-array private data. */
typedef struct raid5_private_data raid5_conf_t;
/* Return the raid5 private data stored in an mddev's 'private' pointer.
 * The argument is parenthesized so that any pointer-valued expression
 * (e.g. "p + 1" or "a ? b : c") expands correctly.
 */
#define mddev_to_conf(mddev) ((raid5_conf_t *) (mddev)->private)
/*
* Our supported algorithms
*/
#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
/* Define non-rotating (raid4) algorithms. These allow
* conversion of raid4 to raid5.
*/
#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
* Firstly, the exact positioning of the parity block is slightly
* different between the 'LEFT_*' modes of md and the "_N_*" modes
* of DDF.
* Secondly, the order of data blocks over which the Q syndrome is computed
* is different.
* Consequently we have different layouts for DDF/raid6 than md/raid6.
* These layouts are from the DDFv1.2 spec.
* Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
* leaves RLQ=3 as 'Vendor Specific'
*/
#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */
/* For every RAID5 algorithm we define a RAID6 algorithm
* with exactly the same layout for data and parity, and
* with the Q block always on the last device (N-1).
* This allows trivial conversion from RAID5 to RAID6
*/
#define ALGORITHM_LEFT_ASYMMETRIC_6 16
#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
#define ALGORITHM_LEFT_SYMMETRIC_6 18
#define ALGORITHM_RIGHT_SYMMETRIC_6 19
#define ALGORITHM_PARITY_0_6 20
#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
/* Return 1 if 'layout' is one of the RAID5 layout numbers 0..5, else 0. */
static inline int algorithm_valid_raid5(int layout)
{
	if (layout < 0)
		return 0;
	return layout <= 5;
}
/* Return non-zero if 'layout' is a valid RAID6 layout number:
 *   0..5   - the rotating and non-rotating parity layouts,
 *   8..10  - the three DDF layouts (ZERO_RESTART, N_RESTART, N_CONTINUE),
 *   16..20 - the "_6" layouts that keep Q on the last device.
 * Layout 9 (ROTATING_N_RESTART) was previously rejected even though it is
 * a defined DDF layout.
 */
static inline int algorithm_valid_raid6(int layout)
{
	return (layout >= 0 && layout <= 5)
		||
		(layout >= 8 && layout <= 10)
		||
		(layout >= 16 && layout <= 20);
}
/* Return 1 if 'layout' is one of the DDF layout numbers 8..10, else 0. */
static inline int algorithm_is_DDF(int layout)
{
	if (layout < 8 || layout > 10)
		return 0;
	return 1;
}
#endif

查看文件

@@ -1,130 +0,0 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright 2003 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
#ifndef LINUX_RAID_RAID6_H
#define LINUX_RAID_RAID6_H
#ifdef __KERNEL__
/* Set to 1 to use kernel-wide empty_zero_page */
#define RAID6_USE_EMPTY_ZERO_PAGE 0
#include <linux/raid/md.h>
#include <linux/raid/raid5.h>
typedef raid5_conf_t raid6_conf_t; /* Same configuration */
/* Additional compute_parity mode -- updates the parity w/o LOCKING */
#define UPDATE_PARITY 4
/* We need a pre-zeroed page... if we don't want to use the kernel-provided
one define it here */
#if RAID6_USE_EMPTY_ZERO_PAGE
# define raid6_empty_zero_page empty_zero_page
#else
extern const char raid6_empty_zero_page[PAGE_SIZE];
#endif
#else /* ! __KERNEL__ */
/* Used for testing in user space */
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/types.h>
/* Not standard, but glibc defines it */
#define BITS_PER_LONG __WORDSIZE
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
extern const char raid6_empty_zero_page[PAGE_SIZE];
#define __init
#define __exit
#define __attribute_const__ __attribute__((const))
#define noinline __attribute__((noinline))
#define preempt_enable()
#define preempt_disable()
#define cpu_has_feature(x) 1
#define enable_kernel_altivec()
#define disable_kernel_altivec()
#endif /* __KERNEL__ */
/* Routine choices */
/* raid6_calls - one selectable set of RAID-6 syndrome routines */
struct raid6_calls {
/* Generate the syndrome; arguments are presumably (disks, bytes, ptrs)
* as in the recovery routines - confirm against an implementation.
*/
void (*gen_syndrome)(int, size_t, void **);
int (*valid)(void); /* Returns 1 if this routine set is usable */
const char *name; /* Name of this routine set */
int prefer; /* Has special performance attribute */
};
/* Selected algorithm */
extern struct raid6_calls raid6_call;
/* Algorithm list */
extern const struct raid6_calls * const raid6_algos[];
int raid6_select_algo(void);
/* Return values from chk_syndrome */
#define RAID6_OK 0
#define RAID6_P_BAD 1
#define RAID6_Q_BAD 2
#define RAID6_PQ_BAD 3
/* Galois field tables */
extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
/* Recovery routines */
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
/* Some definitions to allow code to be compiled for testing in userspace */
#ifndef __KERNEL__
# define jiffies raid6_jiffies()
# define printk printf
# define GFP_KERNEL 0
/* Userspace stand-in for __get_free_pages(): map 2^y anonymous pages.
 * The gfp argument x is ignored. fd is -1, as portable code should pass
 * for MAP_ANONYMOUS mappings.
 * NOTE(review): on failure this yields (unsigned long)MAP_FAILED rather
 * than 0 - confirm how callers test the result.
 */
# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0))
/* Userspace stand-in for free_pages(): y is an order, so the length to
 * unmap is PAGE_SIZE << y, matching what __get_free_pages() mapped.
 */
# define free_pages(x,y) munmap((void *)(x), PAGE_SIZE << (y))
/* Userspace stub for the kernel's cpu_relax(): deliberately a no-op. */
static inline void cpu_relax(void)
{
}
#undef HZ
#define HZ 1000
/* Userspace stand-in for the kernel 'jiffies' counter: wall-clock time in
 * milliseconds, truncated to 32 bits.
 */
static inline uint32_t raid6_jiffies(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	/* Do the conversion in unsigned 32-bit arithmetic: the product wraps
	 * in a well-defined way instead of overflowing a signed time_t where
	 * that type is 32 bits wide. The result modulo 2^32 is unchanged.
	 */
	return (uint32_t)tv.tv_sec * 1000u + (uint32_t)(tv.tv_usec / 1000);
}
#endif /* ! __KERNEL__ */
#endif /* LINUX_RAID_RAID6_H */

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -16,13 +16,20 @@
* Algorithm list and algorithm selection for RAID-6
*/
#include "raid6.h"
#include <linux/raid/pq.h>
#ifndef __KERNEL__
#include <sys/mman.h>
#include <stdio.h>
#else
#if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
EXPORT_SYMBOL(raid6_empty_zero_page);
#endif
#endif
struct raid6_calls raid6_call;
EXPORT_SYMBOL_GPL(raid6_call);
/* Various routine sets */
extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
#else
/* Need more time to be stable in userspace */
#define RAID6_TIME_JIFFIES_LG2 9
#define time_before(x, y) ((x) < (y))
#endif
/* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
return best ? 0 : -EINVAL;
}
/* Module exit hook: there is nothing to tear down. */
static void raid6_exit(void)
{
	/* intentionally empty */
}
subsys_initcall(raid6_select_algo);
module_exit(raid6_exit);
MODULE_LICENSE("GPL");

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
* bracked this with preempt_disable/enable or in a lock)
*/
#include "raid6.h"
#include <linux/raid/pq.h>
#ifdef CONFIG_ALTIVEC

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
* This file is postprocessed using unroll.pl
*/
#include "raid6.h"
#include <linux/raid/pq.h>
/*
* This is the C data type to use

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
#if defined(__i386__) && !defined(__arch_um__)
#include "raid6.h"
#include <linux/raid/pq.h>
#include "raid6x86.h"
/* Shared with raid6sse1.c */

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
* the syndrome.)
*/
#include "raid6.h"
#include <linux/raid/pq.h>
/* Recover two failed data blocks. */
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
p++; q++;
}
}
EXPORT_SYMBOL_GPL(raid6_2data_recov);
/* Recover failure of one data block plus the P block */
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
q++; dq++;
}
}
EXPORT_SYMBOL_GPL(raid6_datap_recov);
#ifndef __KERNEL__ /* Testing only */
#ifndef __KERNEL__
/* Testing only */
/* Recover two failed blocks. */
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
#if defined(__i386__) && !defined(__arch_um__)
#include "raid6.h"
#include <linux/raid/pq.h>
#include "raid6x86.h"
/* Defined in raid6mmx.c */

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
#include "raid6.h"
#include <linux/raid/pq.h>
#include "raid6x86.h"
static const struct raid6_sse_constants {

查看文件

@@ -5,7 +5,7 @@
CC = gcc
OPTFLAGS = -O2 # Adjust as desired
CFLAGS = -I.. -g $(OPTFLAGS)
CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
LD = ld
PERL = perl
AR = ar

查看文件

@@ -17,7 +17,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "raid6.h"
#include <linux/raid/pq.h>
#define NDISKS 16 /* Including P and Q */

查看文件

@@ -5,7 +5,7 @@
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Bostom MA 02111-1307, USA; either version 2 of the License, or
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */