--- zzzz-none-000/linux-2.4.17/drivers/md/raid1.c 2001-10-17 21:21:00.000000000 +0000 +++ sangam-fb-322/linux-2.4.17/drivers/md/raid1.c 2004-11-24 13:23:45.000000000 +0000 @@ -1,1830 +1,1830 @@ -/* - * raid1.c : Multiple Devices driver for Linux - * - * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat - * - * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * - * RAID-1 management functions. - * - * Better read-balancing code written by Mika Kuoppala , 2000 - * - * Fixes to reconstruction by Jakob Østergaard" - * Various fixes by Neil Brown - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY - -#define MAX_WORK_PER_DISK 128 - -#define NR_RESERVED_BUFS 32 - - -/* - * The following can be used to debug the driver - */ -#define RAID1_DEBUG 0 - -#if RAID1_DEBUG -#define PRINTK(x...) printk(x) -#define inline -#define __inline__ -#else -#define PRINTK(x...) do { } while (0) -#endif - - -static mdk_personality_t raid1_personality; -static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; -struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; - -static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) -{ - /* return a linked list of "cnt" struct buffer_heads. - * don't take any off the free list unless we know we can - * get all we need, otherwise we could deadlock - */ - struct buffer_head *bh=NULL; - - while(cnt) { - struct buffer_head *t; - md_spin_lock_irq(&conf->device_lock); - if (!conf->freebh_blocked && conf->freebh_cnt >= cnt) - while (cnt) { - t = conf->freebh; - conf->freebh = t->b_next; - t->b_next = bh; - bh = t; - t->b_state = 0; - conf->freebh_cnt--; - cnt--; - } - md_spin_unlock_irq(&conf->device_lock); - if (cnt == 0) - break; - t = kmem_cache_alloc(bh_cachep, SLAB_NOIO); - if (t) { - t->b_next = bh; - bh = t; - cnt--; - } else { - PRINTK("raid1: waiting for %d bh\n", cnt); - conf->freebh_blocked = 1; - wait_disk_event(conf->wait_buffer, - !conf->freebh_blocked || - conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2); - conf->freebh_blocked = 0; - } - } - return bh; -} - -static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) -{ - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - while (bh) { - struct buffer_head *t = bh; - bh=bh->b_next; - if (t->b_pprev == NULL) - kmem_cache_free(bh_cachep, t); - else { - t->b_next= conf->freebh; - conf->freebh = t; - conf->freebh_cnt++; - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); - wake_up(&conf->wait_buffer); -} - -static int raid1_grow_bh(raid1_conf_t *conf, int cnt) -{ - /* allocate cnt buffer_heads, possibly less if kmalloc fails */ - int i = 0; - - while (i < cnt) { - struct buffer_head *bh; - bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); - if (!bh) break; - - md_spin_lock_irq(&conf->device_lock); - bh->b_pprev = &conf->freebh; - bh->b_next = conf->freebh; - conf->freebh = bh; - conf->freebh_cnt++; - md_spin_unlock_irq(&conf->device_lock); - - i++; - } - return i; -} - -static void raid1_shrink_bh(raid1_conf_t 
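raid1_alloc_bh() above only dips into the shared free list when the whole request can be satisfied at once; otherwise it falls back to the slab allocator or sleeps. A minimal user-space model of that all-or-nothing draw, with a hypothetical node type and a plain mutex standing in for conf->device_lock (a sketch, not the kernel API):

#include <stddef.h>
#include <pthread.h>

struct node { struct node *next; };

struct pool {
	pthread_mutex_t lock;	/* stands in for conf->device_lock */
	struct node *free;	/* stands in for conf->freebh      */
	int free_cnt;		/* stands in for conf->freebh_cnt  */
};

/*
 * Take exactly cnt nodes, or none at all: grabbing a partial batch and
 * then sleeping for the rest is what could deadlock two writers that
 * each hold half of the reserve.
 */
static struct node *pool_take_all_or_none(struct pool *p, int cnt)
{
	struct node *list = NULL;

	pthread_mutex_lock(&p->lock);
	if (p->free_cnt >= cnt) {
		while (cnt--) {
			struct node *t = p->free;
			p->free = t->next;
			t->next = list;
			list = t;
			p->free_cnt--;
		}
	}
	pthread_mutex_unlock(&p->lock);
	return list;		/* NULL means: try the allocator or wait */
}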
*conf) -{ - /* discard all buffer_heads */ - - md_spin_lock_irq(&conf->device_lock); - while (conf->freebh) { - struct buffer_head *bh = conf->freebh; - conf->freebh = bh->b_next; - kmem_cache_free(bh_cachep, bh); - conf->freebh_cnt--; - } - md_spin_unlock_irq(&conf->device_lock); -} - - -static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) -{ - struct raid1_bh *r1_bh = NULL; - - do { - md_spin_lock_irq(&conf->device_lock); - if (!conf->freer1_blocked && conf->freer1) { - r1_bh = conf->freer1; - conf->freer1 = r1_bh->next_r1; - conf->freer1_cnt--; - r1_bh->next_r1 = NULL; - r1_bh->state = (1 << R1BH_PreAlloc); - r1_bh->bh_req.b_state = 0; - } - md_spin_unlock_irq(&conf->device_lock); - if (r1_bh) - return r1_bh; - r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO); - if (r1_bh) { - memset(r1_bh, 0, sizeof(*r1_bh)); - return r1_bh; - } - conf->freer1_blocked = 1; - wait_disk_event(conf->wait_buffer, - !conf->freer1_blocked || - conf->freer1_cnt > NR_RESERVED_BUFS/2 - ); - conf->freer1_blocked = 0; - } while (1); -} - -static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) -{ - struct buffer_head *bh = r1_bh->mirror_bh_list; - raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); - - r1_bh->mirror_bh_list = NULL; - - if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - r1_bh->next_r1 = conf->freer1; - conf->freer1 = r1_bh; - conf->freer1_cnt++; - spin_unlock_irqrestore(&conf->device_lock, flags); - /* don't need to wakeup wait_buffer because - * raid1_free_bh below will do that - */ - } else { - kfree(r1_bh); - } - raid1_free_bh(conf, bh); -} - -static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) -{ - int i = 0; - - while (i < cnt) { - struct raid1_bh *r1_bh; - r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); - if (!r1_bh) - break; - memset(r1_bh, 0, sizeof(*r1_bh)); - set_bit(R1BH_PreAlloc, &r1_bh->state); - r1_bh->mddev = conf->mddev; - - raid1_free_r1bh(r1_bh); - i++; - } - return i; -} - -static void raid1_shrink_r1bh(raid1_conf_t *conf) -{ - md_spin_lock_irq(&conf->device_lock); - while (conf->freer1) { - struct raid1_bh *r1_bh = conf->freer1; - conf->freer1 = r1_bh->next_r1; - conf->freer1_cnt--; - kfree(r1_bh); - } - md_spin_unlock_irq(&conf->device_lock); -} - - - -static inline void raid1_free_buf(struct raid1_bh *r1_bh) -{ - unsigned long flags; - struct buffer_head *bh = r1_bh->mirror_bh_list; - raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); - r1_bh->mirror_bh_list = NULL; - - spin_lock_irqsave(&conf->device_lock, flags); - r1_bh->next_r1 = conf->freebuf; - conf->freebuf = r1_bh; - spin_unlock_irqrestore(&conf->device_lock, flags); - raid1_free_bh(conf, bh); -} - -static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) -{ - struct raid1_bh *r1_bh; - - md_spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); - r1_bh = conf->freebuf; - conf->freebuf = r1_bh->next_r1; - r1_bh->next_r1= NULL; - md_spin_unlock_irq(&conf->device_lock); - - return r1_bh; -} - -static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) -{ - int i = 0; - - md_spin_lock_irq(&conf->device_lock); - while (i < cnt) { - struct raid1_bh *r1_bh; - struct page *page; - - page = alloc_page(GFP_KERNEL); - if (!page) - break; - - r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); - if (!r1_bh) { - __free_page(page); - break; - } - memset(r1_bh, 0, sizeof(*r1_bh)); - r1_bh->bh_req.b_page = page; - r1_bh->bh_req.b_data = 
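raid1_free_r1bh() above uses the R1BH_PreAlloc bit to decide whether a descriptor returns to the reserved pool or is simply kfree()d because it was a transient kmalloc() fallback. A small user-space model of that split, with hypothetical names:

#include <stdlib.h>
#include <pthread.h>

#define PREALLOC 0x1		/* models the R1BH_PreAlloc state bit */

struct desc {
	struct desc *next;
	unsigned int state;
};

struct desc_pool {
	pthread_mutex_t lock;
	struct desc *free;
	int free_cnt;
};

/* Reserved descriptors go back on the list; fallback ones are freed. */
static void desc_put(struct desc_pool *p, struct desc *d)
{
	if (d->state & PREALLOC) {
		pthread_mutex_lock(&p->lock);
		d->next = p->free;
		p->free = d;
		p->free_cnt++;
		pthread_mutex_unlock(&p->lock);
	} else {
		free(d);
	}
}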
page_address(page); - r1_bh->next_r1 = conf->freebuf; - conf->freebuf = r1_bh; - i++; - } - md_spin_unlock_irq(&conf->device_lock); - return i; -} - -static void raid1_shrink_buffers (raid1_conf_t *conf) -{ - md_spin_lock_irq(&conf->device_lock); - while (conf->freebuf) { - struct raid1_bh *r1_bh = conf->freebuf; - conf->freebuf = r1_bh->next_r1; - __free_page(r1_bh->bh_req.b_page); - kfree(r1_bh); - } - md_spin_unlock_irq(&conf->device_lock); -} - -static int raid1_map (mddev_t *mddev, kdev_t *rdev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - int i, disks = MD_SB_DISKS; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. - */ - - for (i = 0; i < disks; i++) { - if (conf->mirrors[i].operational) { - *rdev = conf->mirrors[i].dev; - return (0); - } - } - - printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); - return (-1); -} - -static void raid1_reschedule_retry (struct raid1_bh *r1_bh) -{ - unsigned long flags; - mddev_t *mddev = r1_bh->mddev; - raid1_conf_t *conf = mddev_to_conf(mddev); - - md_spin_lock_irqsave(&retry_list_lock, flags); - if (raid1_retry_list == NULL) - raid1_retry_tail = &raid1_retry_list; - *raid1_retry_tail = r1_bh; - raid1_retry_tail = &r1_bh->next_r1; - r1_bh->next_r1 = NULL; - md_spin_unlock_irqrestore(&retry_list_lock, flags); - md_wakeup_thread(conf->thread); -} - - -static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) -{ - unsigned long flags; - spin_lock_irqsave(&conf->segment_lock, flags); - if (sector < conf->start_active) - conf->cnt_done--; - else if (sector >= conf->start_future && conf->phase == phase) - conf->cnt_future--; - else if (!--conf->cnt_pending) - wake_up(&conf->wait_ready); - - spin_unlock_irqrestore(&conf->segment_lock, flags); -} - -static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) -{ - unsigned long flags; - spin_lock_irqsave(&conf->segment_lock, flags); - if (sector >= conf->start_ready) - --conf->cnt_ready; - else if (sector >= conf->start_active) { - if (!--conf->cnt_active) { - conf->start_active = conf->start_ready; - wake_up(&conf->wait_done); - } - } - spin_unlock_irqrestore(&conf->segment_lock, flags); -} - -/* - * raid1_end_bh_io() is called when we have finished servicing a mirrored - * operation and are ready to return a success/failure code to the buffer - * cache layer. - */ -static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) -{ - struct buffer_head *bh = r1_bh->master_bh; - - io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), - test_bit(R1BH_SyncPhase, &r1_bh->state)); - - bh->b_end_io(bh, uptodate); - raid1_free_r1bh(r1_bh); -} -void raid1_end_request (struct buffer_head *bh, int uptodate) -{ - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); - - /* - * this branch is our 'one mirror IO has finished' event handler: - */ - if (!uptodate) - md_error (r1_bh->mddev, bh->b_dev); - else - /* - * Set R1BH_Uptodate in our master buffer_head, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the complex operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' buffer_head. - */ - set_bit (R1BH_Uptodate, &r1_bh->state); - - /* - * We split up the read and write side, imho they are - * conceptually different. 
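raid1_reschedule_retry() appends to a global singly linked list through a pointer-to-pointer tail (raid1_retry_tail), so appends are O(1) without a doubly linked list. A user-space model of the same idiom (the driver holds retry_list_lock around this; locking is omitted here):

#include <stddef.h>

struct retry { struct retry *next; };

static struct retry *retry_list;	/* head of the queue                    */
static struct retry **retry_tail;	/* points at the last ->next (or head)  */

static void retry_append(struct retry *r)
{
	if (retry_list == NULL)
		retry_tail = &retry_list;	/* empty: tail is the head pointer */
	*retry_tail = r;
	retry_tail = &r->next;
	r->next = NULL;
}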
- */ - - if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { - /* - * we have only one buffer_head on the read side - */ - - if (uptodate) { - raid1_end_bh_io(r1_bh, uptodate); - return; - } - /* - * oops, read error: - */ - printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", - partition_name(bh->b_dev), bh->b_blocknr); - raid1_reschedule_retry(r1_bh); - return; - } - - /* - * WRITE: - * - * Let's see if all mirrored write operations have finished - * already. - */ - - if (atomic_dec_and_test(&r1_bh->remaining)) - raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); -} - -/* - * This routine returns the disk from which the requested read should - * be done. It bookkeeps the last read position for every disk - * in array and when new read requests come, the disk which last - * position is nearest to the request, is chosen. - * - * TODO: now if there are 2 mirrors in the same 2 devices, performance - * degrades dramatically because position is mirror, not device based. - * This should be changed to be device based. Also atomic sequential - * reads should be somehow balanced. - */ - -static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) -{ - int new_disk = conf->last_used; - const int sectors = bh->b_size >> 9; - const unsigned long this_sector = bh->b_rsector; - int disk = new_disk; - unsigned long new_distance; - unsigned long current_distance; - - /* - * Check if it is sane at all to balance - */ - - if (conf->resync_mirrors) - goto rb_out; - - - /* make sure that disk is operational */ - while( !conf->mirrors[new_disk].operational) { - if (new_disk <= 0) new_disk = conf->raid_disks; - new_disk--; - if (new_disk == disk) { - /* - * This means no working disk was found - * Nothing much to do, lets not change anything - * and hope for the best... - */ - - new_disk = conf->last_used; - - goto rb_out; - } - } - disk = new_disk; - /* now disk == new_disk == starting point for search */ - - /* - * Don't touch anything for sequential reads. - */ - - if (this_sector == conf->mirrors[new_disk].head_position) - goto rb_out; - - /* - * If reads have been done only on a single disk - * for a time, lets give another disk a change. - * This is for kicking those idling disks so that - * they would find work near some hotspot. 
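On the write side every mirror gets its own buffer_head, and the master request completes only when the atomic remaining count reaches zero; the master is reported good if at least one mirror succeeded (R1BH_Uptodate). A sketch of that completion pattern with C11 atomics and hypothetical types (not the md interface):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct master_req {
	atomic_int remaining;	/* models r1_bh->remaining            */
	atomic_bool uptodate;	/* models the R1BH_Uptodate state bit */
};

/* Called once per mirror write as it finishes. */
static void mirror_write_done(struct master_req *m, bool ok)
{
	if (ok)
		atomic_store(&m->uptodate, true);	/* one good copy is enough */

	if (atomic_fetch_sub(&m->remaining, 1) == 1)
		printf("master done, uptodate=%d\n",
		       atomic_load(&m->uptodate) ? 1 : 0);
}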
- */ - - if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { - conf->sect_count = 0; - - do { - if (new_disk<=0) - new_disk = conf->raid_disks; - new_disk--; - if (new_disk == disk) - break; - } while ((conf->mirrors[new_disk].write_only) || - (!conf->mirrors[new_disk].operational)); - - goto rb_out; - } - - current_distance = abs(this_sector - - conf->mirrors[disk].head_position); - - /* Find the disk which is closest */ - - do { - if (disk <= 0) - disk = conf->raid_disks; - disk--; - - if ((conf->mirrors[disk].write_only) || - (!conf->mirrors[disk].operational)) - continue; - - new_distance = abs(this_sector - - conf->mirrors[disk].head_position); - - if (new_distance < current_distance) { - conf->sect_count = 0; - current_distance = new_distance; - new_disk = disk; - } - } while (disk != conf->last_used); - -rb_out: - conf->mirrors[new_disk].head_position = this_sector + sectors; - - conf->last_used = new_disk; - conf->sect_count += sectors; - - return new_disk; -} - -static int raid1_make_request (mddev_t *mddev, int rw, - struct buffer_head * bh) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct buffer_head *bh_req, *bhl; - struct raid1_bh * r1_bh; - int disks = MD_SB_DISKS; - int i, sum_bhs = 0; - struct mirror_info *mirror; - - if (!buffer_locked(bh)) - BUG(); - -/* - * make_request() can abort the operation when READA is being - * used and no empty request is available. - * - * Currently, just replace the command with READ/WRITE. - */ - if (rw == READA) - rw = READ; - - r1_bh = raid1_alloc_r1bh (conf); - - spin_lock_irq(&conf->segment_lock); - wait_event_lock_irq(conf->wait_done, - bh->b_rsector < conf->start_active || - bh->b_rsector >= conf->start_future, - conf->segment_lock); - if (bh->b_rsector < conf->start_active) - conf->cnt_done++; - else { - conf->cnt_future++; - if (conf->phase) - set_bit(R1BH_SyncPhase, &r1_bh->state); - } - spin_unlock_irq(&conf->segment_lock); - - /* - * i think the read and write branch should be separated completely, - * since we want to do read balancing on the read side for example. - * Alternative implementations? :) --mingo - */ - - r1_bh->master_bh = bh; - r1_bh->mddev = mddev; - r1_bh->cmd = rw; - - if (rw == READ) { - /* - * read balancing logic: - */ - mirror = conf->mirrors + raid1_read_balance(conf, bh); - - bh_req = &r1_bh->bh_req; - memcpy(bh_req, bh, sizeof(*bh)); - bh_req->b_blocknr = bh->b_rsector; - bh_req->b_dev = mirror->dev; - bh_req->b_rdev = mirror->dev; - /* bh_req->b_rsector = bh->n_rsector; */ - bh_req->b_end_io = raid1_end_request; - bh_req->b_private = r1_bh; - generic_make_request (rw, bh_req); - return 0; - } - - /* - * WRITE: - */ - - bhl = raid1_alloc_bh(conf, conf->raid_disks); - for (i = 0; i < disks; i++) { - struct buffer_head *mbh; - if (!conf->mirrors[i].operational) - continue; - - /* - * We should use a private pool (size depending on NR_REQUEST), - * to avoid writes filling up the memory with bhs - * - * Such pools are much faster than kmalloc anyways (so we waste - * almost nothing by not using the master bh when writing and - * win alot of cleanness) but for now we are cool enough. --mingo - * - * It's safe to sleep here, buffer heads cannot be used in a shared - * manner in the write branch. 
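raid1_read_balance() keeps a head_position per mirror, leaves sequential reads on the current disk, rotates away after sect_limit sectors, and otherwise picks the operational mirror whose recorded head position is closest to the request. A compact user-space model of the distance search, with a hypothetical mirror struct and assuming last_used is operational (the driver guarantees that before this point):

#include <stdlib.h>

struct mirror {
	unsigned long head_position;
	int operational;
	int write_only;
};

/* Pick the usable mirror whose last known head position is nearest. */
static int pick_closest(struct mirror *m, int nmirrors, int last_used,
			unsigned long sector)
{
	int best = last_used;
	unsigned long best_dist = labs((long)(sector - m[best].head_position));
	int disk = last_used;

	do {
		unsigned long d;

		disk = (disk == 0) ? nmirrors - 1 : disk - 1;
		if (!m[disk].operational || m[disk].write_only)
			continue;
		d = labs((long)(sector - m[disk].head_position));
		if (d < best_dist) {
			best_dist = d;
			best = disk;
		}
	} while (disk != last_used);

	return best;
}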
Look how we lock the buffer at the - * beginning of this function to grok the difference ;) - */ - mbh = bhl; - if (mbh == NULL) { - MD_BUG(); - break; - } - bhl = mbh->b_next; - mbh->b_next = NULL; - mbh->b_this_page = (struct buffer_head *)1; - - /* - * prepare mirrored mbh (fields ordered for max mem throughput): - */ - mbh->b_blocknr = bh->b_rsector; - mbh->b_dev = conf->mirrors[i].dev; - mbh->b_rdev = conf->mirrors[i].dev; - mbh->b_rsector = bh->b_rsector; - mbh->b_state = (1<b_count, 1); - mbh->b_size = bh->b_size; - mbh->b_page = bh->b_page; - mbh->b_data = bh->b_data; - mbh->b_list = BUF_LOCKED; - mbh->b_end_io = raid1_end_request; - mbh->b_private = r1_bh; - - mbh->b_next = r1_bh->mirror_bh_list; - r1_bh->mirror_bh_list = mbh; - sum_bhs++; - } - if (bhl) raid1_free_bh(conf,bhl); - if (!sum_bhs) { - /* Gag - all mirrors non-operational.. */ - raid1_end_bh_io(r1_bh, 0); - return 0; - } - md_atomic_set(&r1_bh->remaining, sum_bhs); - - /* - * We have to be a bit careful about the semaphore above, thats - * why we start the requests separately. Since kmalloc() could - * fail, sleep and make_request() can sleep too, this is the - * safer solution. Imagine, end_request decreasing the semaphore - * before we could have set it up ... We could play tricks with - * the semaphore (presetting it and correcting at the end if - * sum_bhs is not 'n' but we have to do end_request by hand if - * all requests finish until we had a chance to set up the - * semaphore correctly ... lots of races). - */ - bh = r1_bh->mirror_bh_list; - while(bh) { - struct buffer_head *bh2 = bh; - bh = bh->b_next; - generic_make_request(rw, bh2); - } - return (0); -} - -static int raid1_status (char *page, mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - int sz = 0, i; - - sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, - conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", - conf->mirrors[i].operational ? "U" : "_"); - sz += sprintf (page+sz, "]"); - return sz; -} - -#define LAST_DISK KERN_ALERT \ -"raid1: only one disk left and IO error.\n" - -#define NO_SPARE_DISK KERN_ALERT \ -"raid1: no spare disk left, degrading mirror level by one.\n" - -#define DISK_FAILED KERN_ALERT \ -"raid1: Disk failure on %s, disabling device. \n" \ -" Operation continuing on %d devices\n" - -#define START_SYNCING KERN_ALERT \ -"raid1: start syncing spare disk.\n" - -#define ALREADY_SYNCING KERN_INFO \ -"raid1: syncing already in progress.\n" - -static void mark_disk_bad (mddev_t *mddev, int failed) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct mirror_info *mirror = conf->mirrors+failed; - mdp_super_t *sb = mddev->sb; - - mirror->operational = 0; - mark_disk_faulty(sb->disks+mirror->number); - mark_disk_nonsync(sb->disks+mirror->number); - mark_disk_inactive(sb->disks+mirror->number); - if (!mirror->write_only) - sb->active_disks--; - sb->working_disks--; - sb->failed_disks++; - mddev->sb_dirty = 1; - md_wakeup_thread(conf->thread); - if (!mirror->write_only) - conf->working_disks--; - printk (DISK_FAILED, partition_name (mirror->dev), - conf->working_disks); -} - -static int raid1_error (mddev_t *mddev, kdev_t dev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct mirror_info * mirrors = conf->mirrors; - int disks = MD_SB_DISKS; - int i; - - /* Find the drive. - * If it is not operational, then we have already marked it as dead - * else if it is the last working disks, ignore the error, let the - * next level up know. 
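For a write, the driver clones the master buffer_head once per operational mirror, chains the clones on mirror_bh_list, sets remaining, and only then submits them, so a fast completion cannot observe a half-initialised count. A user-space sketch of that ordering with a hypothetical request type:

#include <stdatomic.h>
#include <stddef.h>

struct wreq {
	struct wreq *next;
	int mirror;
	void (*submit)(struct wreq *);
};

struct wmaster {
	atomic_int remaining;
	struct wreq *clones;	/* models r1_bh->mirror_bh_list */
};

static void fan_out(struct wmaster *m, struct wreq *reqs, int n)
{
	int i;

	/* 1. build the whole clone list first */
	m->clones = NULL;
	for (i = 0; i < n; i++) {
		reqs[i].next = m->clones;
		m->clones = &reqs[i];
	}

	/* 2. publish the count before any I/O can complete */
	atomic_store(&m->remaining, n);

	/* 3. only now start the clones */
	for (i = 0; i < n; i++)
		reqs[i].submit(&reqs[i]);
}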
- * else mark the drive as failed - */ - - for (i = 0; i < disks; i++) - if (mirrors[i].dev==dev && mirrors[i].operational) - break; - if (i == disks) - return 0; - - if (i < conf->raid_disks && conf->working_disks == 1) { - /* Don't fail the drive, act as though we were just a - * normal single drive - */ - - return 1; - } - mark_disk_bad(mddev, i); - return 0; -} - -#undef LAST_DISK -#undef NO_SPARE_DISK -#undef DISK_FAILED -#undef START_SYNCING - - -static void print_raid1_conf (raid1_conf_t *conf) -{ - int i; - struct mirror_info *tmp; - - printk("RAID1 conf printout:\n"); - if (!conf) { - printk("(conf==NULL)\n"); - return; - } - printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, - conf->raid_disks, conf->nr_disks); - - for (i = 0; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", - i, tmp->spare,tmp->operational, - tmp->number,tmp->raid_disk,tmp->used_slot, - partition_name(tmp->dev)); - } -} - -static void close_sync(raid1_conf_t *conf) -{ - mddev_t *mddev = conf->mddev; - /* If reconstruction was interrupted, we need to close the "active" and "pending" - * holes. - * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 - */ - /* this is really needed when recovery stops too... */ - spin_lock_irq(&conf->segment_lock); - conf->start_active = conf->start_pending; - conf->start_ready = conf->start_pending; - wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); - conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; - conf->start_future = mddev->sb->size+1; - conf->cnt_pending = conf->cnt_future; - conf->cnt_future = 0; - conf->phase = conf->phase ^1; - wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); - conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; - conf->phase = 0; - conf->cnt_future = conf->cnt_done;; - conf->cnt_done = 0; - spin_unlock_irq(&conf->segment_lock); - wake_up(&conf->wait_done); -} - -static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) -{ - int err = 0; - int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; - raid1_conf_t *conf = mddev->private; - struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; - mdp_super_t *sb = mddev->sb; - mdp_disk_t *failed_desc, *spare_desc, *added_desc; - mdk_rdev_t *spare_rdev, *failed_rdev; - - print_raid1_conf(conf); - md_spin_lock_irq(&conf->device_lock); - /* - * find the disk ... - */ - switch (state) { - - case DISKOP_SPARE_ACTIVE: - - /* - * Find the failed disk within the RAID1 configuration ... - * (this can only be in the first conf->working_disks part) - */ - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if ((!tmp->operational && !tmp->spare) || - !tmp->used_slot) { - failed_disk = i; - break; - } - } - /* - * When we activate a spare disk we _must_ have a disk in - * the lower (active) part of the array to replace. - */ - if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { - MD_BUG(); - err = 1; - goto abort; - } - /* fall through */ - - case DISKOP_SPARE_WRITE: - case DISKOP_SPARE_INACTIVE: - - /* - * Find the spare disk ... 
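raid1_error() deliberately refuses to mark the last working mirror faulty: with one disk left the array behaves like a plain disk and the error is passed upward instead. The decision reduces to a small predicate (sketch with hypothetical field names):

struct r1_state {
	int raid_disks;
	int working_disks;
};

/*
 * Returns 1 when the error should be ignored here and reported to the
 * caller (last working mirror), 0 when the disk should be failed.
 */
static int keep_last_disk(const struct r1_state *s, int disk_idx)
{
	return disk_idx < s->raid_disks && s->working_disks == 1;
}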
(can only be in the 'high' - * area of the array) - */ - for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - if (tmp->spare && tmp->number == (*d)->number) { - spare_disk = i; - break; - } - } - if (spare_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - - case DISKOP_HOT_REMOVE_DISK: - - for (i = 0; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - if (tmp->used_slot && (tmp->number == (*d)->number)) { - if (tmp->operational) { - err = -EBUSY; - goto abort; - } - removed_disk = i; - break; - } - } - if (removed_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - - case DISKOP_HOT_ADD_DISK: - - for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { - tmp = conf->mirrors + i; - if (!tmp->used_slot) { - added_disk = i; - break; - } - } - if (added_disk == -1) { - MD_BUG(); - err = 1; - goto abort; - } - break; - } - - switch (state) { - /* - * Switch the spare disk to write-only mode: - */ - case DISKOP_SPARE_WRITE: - sdisk = conf->mirrors + spare_disk; - sdisk->operational = 1; - sdisk->write_only = 1; - break; - /* - * Deactivate a spare disk: - */ - case DISKOP_SPARE_INACTIVE: - close_sync(conf); - sdisk = conf->mirrors + spare_disk; - sdisk->operational = 0; - sdisk->write_only = 0; - break; - /* - * Activate (mark read-write) the (now sync) spare disk, - * which means we switch it's 'raid position' (->raid_disk) - * with the failed disk. (only the first 'conf->nr_disks' - * slots are used for 'real' disks and we must preserve this - * property) - */ - case DISKOP_SPARE_ACTIVE: - close_sync(conf); - sdisk = conf->mirrors + spare_disk; - fdisk = conf->mirrors + failed_disk; - - spare_desc = &sb->disks[sdisk->number]; - failed_desc = &sb->disks[fdisk->number]; - - if (spare_desc != *d) { - MD_BUG(); - err = 1; - goto abort; - } - - if (spare_desc->raid_disk != sdisk->raid_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (sdisk->raid_disk != spare_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (failed_desc->raid_disk != fdisk->raid_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - if (fdisk->raid_disk != failed_disk) { - MD_BUG(); - err = 1; - goto abort; - } - - /* - * do the switch finally - */ - spare_rdev = find_rdev_nr(mddev, spare_desc->number); - failed_rdev = find_rdev_nr(mddev, failed_desc->number); - - /* There must be a spare_rdev, but there may not be a - * failed_rdev. That slot might be empty... - */ - spare_rdev->desc_nr = failed_desc->number; - if (failed_rdev) - failed_rdev->desc_nr = spare_desc->number; - - xchg_values(*spare_desc, *failed_desc); - xchg_values(*fdisk, *sdisk); - - /* - * (careful, 'failed' and 'spare' are switched from now on) - * - * we want to preserve linear numbering and we want to - * give the proper raid_disk number to the now activated - * disk. (this means we switch back these values) - */ - - xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); - xchg_values(sdisk->raid_disk, fdisk->raid_disk); - xchg_values(spare_desc->number, failed_desc->number); - xchg_values(sdisk->number, fdisk->number); - - *d = failed_desc; - - if (sdisk->dev == MKDEV(0,0)) - sdisk->used_slot = 0; - /* - * this really activates the spare. - */ - fdisk->spare = 0; - fdisk->write_only = 0; - - /* - * if we activate a spare, we definitely replace a - * non-operational disk slot in the 'low' area of - * the disk array. 
- */ - - conf->working_disks++; - - break; - - case DISKOP_HOT_REMOVE_DISK: - rdisk = conf->mirrors + removed_disk; - - if (rdisk->spare && (removed_disk < conf->raid_disks)) { - MD_BUG(); - err = 1; - goto abort; - } - rdisk->dev = MKDEV(0,0); - rdisk->used_slot = 0; - conf->nr_disks--; - break; - - case DISKOP_HOT_ADD_DISK: - adisk = conf->mirrors + added_disk; - added_desc = *d; - - if (added_disk != added_desc->number) { - MD_BUG(); - err = 1; - goto abort; - } - - adisk->number = added_desc->number; - adisk->raid_disk = added_desc->raid_disk; - adisk->dev = MKDEV(added_desc->major,added_desc->minor); - - adisk->operational = 0; - adisk->write_only = 0; - adisk->spare = 1; - adisk->used_slot = 1; - adisk->head_position = 0; - conf->nr_disks++; - - break; - - default: - MD_BUG(); - err = 1; - goto abort; - } -abort: - md_spin_unlock_irq(&conf->device_lock); - if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) - /* should move to "END_REBUILD" when such exists */ - raid1_shrink_buffers(conf); - - print_raid1_conf(conf); - return err; -} - - -#define IO_ERROR KERN_ALERT \ -"raid1: %s: unrecoverable I/O read error for block %lu\n" - -#define REDIRECT_SECTOR KERN_ERR \ -"raid1: %s: redirecting sector %lu to another mirror\n" - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working mirrors. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. - */ -static void end_sync_write(struct buffer_head *bh, int uptodate); -static void end_sync_read(struct buffer_head *bh, int uptodate); - -static void raid1d (void *data) -{ - struct raid1_bh *r1_bh; - struct buffer_head *bh; - unsigned long flags; - mddev_t *mddev; - kdev_t dev; - - - for (;;) { - md_spin_lock_irqsave(&retry_list_lock, flags); - r1_bh = raid1_retry_list; - if (!r1_bh) - break; - raid1_retry_list = r1_bh->next_r1; - md_spin_unlock_irqrestore(&retry_list_lock, flags); - - mddev = r1_bh->mddev; - if (mddev->sb_dirty) { - printk(KERN_INFO "raid1: dirty sb detected, updating.\n"); - mddev->sb_dirty = 0; - md_update_sb(mddev); - } - bh = &r1_bh->bh_req; - switch(r1_bh->cmd) { - case SPECIAL: - /* have to allocate lots of bh structures and - * schedule writes - */ - if (test_bit(R1BH_Uptodate, &r1_bh->state)) { - int i, sum_bhs = 0; - int disks = MD_SB_DISKS; - struct buffer_head *bhl, *mbh; - raid1_conf_t *conf; - - conf = mddev_to_conf(mddev); - bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ - for (i = 0; i < disks ; i++) { - if (!conf->mirrors[i].operational) - continue; - if (i==conf->last_used) - /* we read from here, no need to write */ - continue; - if (i < conf->raid_disks - && !conf->resync_mirrors) - /* don't need to write this, - * we are just rebuilding */ - continue; - mbh = bhl; - if (!mbh) { - MD_BUG(); - break; - } - bhl = mbh->b_next; - mbh->b_this_page = (struct buffer_head *)1; - - - /* - * prepare mirrored bh (fields ordered for max mem throughput): - */ - mbh->b_blocknr = bh->b_blocknr; - mbh->b_dev = conf->mirrors[i].dev; - mbh->b_rdev = conf->mirrors[i].dev; - mbh->b_rsector = bh->b_blocknr; - mbh->b_state = (1<b_count, 1); - mbh->b_size = bh->b_size; - mbh->b_page = bh->b_page; - mbh->b_data = bh->b_data; - mbh->b_list = BUF_LOCKED; - mbh->b_end_io = end_sync_write; - mbh->b_private = r1_bh; - - mbh->b_next = r1_bh->mirror_bh_list; - r1_bh->mirror_bh_list = mbh; - - sum_bhs++; - } - md_atomic_set(&r1_bh->remaining, sum_bhs); - if (bhl) raid1_free_bh(conf, bhl); - 
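In the DISKOP_SPARE_ACTIVE path above, the spare's and the failed disk's descriptors are swapped wholesale with xchg_values(), and then raid_disk and number are swapped back so slot numbering stays linear while the device identities trade places. A toy stand-alone illustration of that double swap (hypothetical slot struct; the dev field stands in for everything that really should change hands):

#include <stdio.h>

/* models md's xchg_values(); typeof is a GNU C extension */
#define xchg_values(a, b) do { typeof(a) __t = (a); (a) = (b); (b) = __t; } while (0)

struct slot { int number; int raid_disk; int dev; };

int main(void)
{
	struct slot failed = { .number = 1, .raid_disk = 1, .dev = 0x811 };
	struct slot spare  = { .number = 3, .raid_disk = 3, .dev = 0x821 };

	xchg_values(failed, spare);			/* swap everything...  */
	xchg_values(failed.raid_disk, spare.raid_disk);	/* ...then restore the */
	xchg_values(failed.number, spare.number);	/* slot numbering      */

	/* the spare's device now sits in the active slot, numbering unchanged */
	printf("failed slot: number=%d raid_disk=%d dev=%#x\n",
	       failed.number, failed.raid_disk, failed.dev);
	printf("spare  slot: number=%d raid_disk=%d dev=%#x\n",
	       spare.number, spare.raid_disk, spare.dev);
	return 0;
}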
mbh = r1_bh->mirror_bh_list; - - if (!sum_bhs) { - /* nowhere to write this too... I guess we - * must be done - */ - sync_request_done(bh->b_blocknr, conf); - md_done_sync(mddev, bh->b_size>>9, 0); - raid1_free_buf(r1_bh); - } else - while (mbh) { - struct buffer_head *bh1 = mbh; - mbh = mbh->b_next; - generic_make_request(WRITE, bh1); - md_sync_acct(bh1->b_dev, bh1->b_size/512); - } - } else { - /* There is no point trying a read-for-reconstruct - * as reconstruct is about to be aborted - */ - - printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); - md_done_sync(mddev, bh->b_size>>9, 0); - } - - break; - case READ: - case READA: - dev = bh->b_dev; - raid1_map (mddev, &bh->b_dev); - if (bh->b_dev == dev) { - printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); - raid1_end_bh_io(r1_bh, 0); - } else { - printk (REDIRECT_SECTOR, - partition_name(bh->b_dev), bh->b_blocknr); - bh->b_rdev = bh->b_dev; - bh->b_rsector = bh->b_blocknr; - generic_make_request (r1_bh->cmd, bh); - } - break; - } - } - md_spin_unlock_irqrestore(&retry_list_lock, flags); -} -#undef IO_ERROR -#undef REDIRECT_SECTOR - -/* - * Private kernel thread to reconstruct mirrors after an unclean - * shutdown. - */ -static void raid1syncd (void *data) -{ - raid1_conf_t *conf = data; - mddev_t *mddev = conf->mddev; - - if (!conf->resync_mirrors) - return; - if (conf->resync_mirrors == 2) - return; - down(&mddev->recovery_sem); - if (!md_do_sync(mddev, NULL)) { - /* - * Only if everything went Ok. - */ - conf->resync_mirrors = 0; - } - - close_sync(conf); - - up(&mddev->recovery_sem); - raid1_shrink_buffers(conf); -} - -/* - * perform a "sync" on one "block" - * - * We need to make sure that no normal I/O request - particularly write - * requests - conflict with active sync requests. - * This is achieved by conceptually dividing the device space into a - * number of sections: - * DONE: 0 .. a-1 These blocks are in-sync - * ACTIVE: a.. b-1 These blocks may have active sync requests, but - * no normal IO requests - * READY: b .. c-1 These blocks have no normal IO requests - sync - * request may be happening - * PENDING: c .. d-1 These blocks may have IO requests, but no new - * ones will be added - * FUTURE: d .. end These blocks are not to be considered yet. IO may - * be happening, but not sync - * - * We keep a - * phase which flips (0 or 1) each time d moves and - * a count of: - * z = active io requests in FUTURE since d moved - marked with - * current phase - * y = active io requests in FUTURE before d moved, or PENDING - - * marked with previous phase - * x = active sync requests in READY - * w = active sync requests in ACTIVE - * v = active io requests in DONE - * - * Normally, a=b=c=d=0 and z= active io requests - * or a=b=c=d=END and v= active io requests - * Allowed changes to a,b,c,d: - * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase - * B: y==0 -> c=d - * C: b=c, w+=x, x=0 - * D: w==0 -> a=b - * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 - * - * At start of sync we apply A. - * When y reaches 0, we apply B then A then being sync requests - * When sync point reaches c-1, we wait for y==0, and W==0, and - * then apply apply B then A then D then C. 
- * Finally, we apply E - * - * The sync request simply issues a "read" against a working drive - * This is marked so that on completion the raid1d thread is woken to - * issue suitable write requests - */ - -static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - struct mirror_info *mirror; - struct raid1_bh *r1_bh; - struct buffer_head *bh; - int bsize; - int disk; - int block_nr; - - spin_lock_irq(&conf->segment_lock); - if (!sector_nr) { - /* initialize ...*/ - int buffs; - conf->start_active = 0; - conf->start_ready = 0; - conf->start_pending = 0; - conf->start_future = 0; - conf->phase = 0; - /* we want enough buffers to hold twice the window of 128*/ - buffs = 128 *2 / (PAGE_SIZE>>9); - buffs = raid1_grow_buffers(conf, buffs); - if (buffs < 2) - goto nomem; - - conf->window = buffs*(PAGE_SIZE>>9)/2; - conf->cnt_future += conf->cnt_done+conf->cnt_pending; - conf->cnt_done = conf->cnt_pending = 0; - if (conf->cnt_ready || conf->cnt_active) - MD_BUG(); - } - while (sector_nr >= conf->start_pending) { - PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", - sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, - conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); - wait_event_lock_irq(conf->wait_done, - !conf->cnt_active, - conf->segment_lock); - wait_event_lock_irq(conf->wait_ready, - !conf->cnt_pending, - conf->segment_lock); - conf->start_active = conf->start_ready; - conf->start_ready = conf->start_pending; - conf->start_pending = conf->start_future; - conf->start_future = conf->start_future+conf->window; - // Note: falling off the end is not a problem - conf->phase = conf->phase ^1; - conf->cnt_active = conf->cnt_ready; - conf->cnt_ready = 0; - conf->cnt_pending = conf->cnt_future; - conf->cnt_future = 0; - wake_up(&conf->wait_done); - } - conf->cnt_ready++; - spin_unlock_irq(&conf->segment_lock); - - - /* If reconstructing, and >1 working disc, - * could dedicate one to rebuild and others to - * service read requests .. 
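The resync window described above keeps four boundaries (start_active/ready/pending/future) plus per-region counters, and transition A advances the future boundary by one window while flipping the phase. A tiny user-space model of that single boundary shuffle, taken from the logic in raid1_sync_request() (hypothetical struct, no locking or waiting shown):

struct seg {
	unsigned long start_active, start_ready, start_pending, start_future;
	unsigned long window;
	int phase;
	int cnt_active, cnt_ready, cnt_pending, cnt_future;
};

/*
 * Models the shuffle done once the ACTIVE and PENDING regions have
 * drained (cnt_active == 0 and cnt_pending == 0).
 */
static void advance_window(struct seg *s)
{
	s->start_active  = s->start_ready;
	s->start_ready   = s->start_pending;
	s->start_pending = s->start_future;
	s->start_future += s->window;
	s->phase ^= 1;

	s->cnt_active  = s->cnt_ready;   s->cnt_ready  = 0;
	s->cnt_pending = s->cnt_future;  s->cnt_future = 0;
}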
- */ - disk = conf->last_used; - /* make sure disk is operational */ - while (!conf->mirrors[disk].operational) { - if (disk <= 0) disk = conf->raid_disks; - disk--; - if (disk == conf->last_used) - break; - } - conf->last_used = disk; - - mirror = conf->mirrors+conf->last_used; - - r1_bh = raid1_alloc_buf (conf); - r1_bh->master_bh = NULL; - r1_bh->mddev = mddev; - r1_bh->cmd = SPECIAL; - bh = &r1_bh->bh_req; - - block_nr = sector_nr; - bsize = 512; - while (!(block_nr & 1) && bsize < PAGE_SIZE - && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) { - block_nr >>= 1; - bsize <<= 1; - } - bh->b_size = bsize; - bh->b_list = BUF_LOCKED; - bh->b_dev = mirror->dev; - bh->b_rdev = mirror->dev; - bh->b_state = (1<b_page) - BUG(); - if (!bh->b_data) - BUG(); - if (bh->b_data != page_address(bh->b_page)) - BUG(); - bh->b_end_io = end_sync_read; - bh->b_private = r1_bh; - bh->b_blocknr = sector_nr; - bh->b_rsector = sector_nr; - init_waitqueue_head(&bh->b_wait); - - generic_make_request(READ, bh); - md_sync_acct(bh->b_dev, bh->b_size/512); - - return (bsize >> 9); - -nomem: - raid1_shrink_buffers(conf); - spin_unlock_irq(&conf->segment_lock); - return -ENOMEM; -} - -static void end_sync_read(struct buffer_head *bh, int uptodate) -{ - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); - - /* we have read a block, now it needs to be re-written, - * or re-read if the read failed. - * We don't do much here, just schedule handling by raid1d - */ - if (!uptodate) - md_error (r1_bh->mddev, bh->b_dev); - else - set_bit(R1BH_Uptodate, &r1_bh->state); - raid1_reschedule_retry(r1_bh); -} - -static void end_sync_write(struct buffer_head *bh, int uptodate) -{ - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); - - if (!uptodate) - md_error (r1_bh->mddev, bh->b_dev); - if (atomic_dec_and_test(&r1_bh->remaining)) { - mddev_t *mddev = r1_bh->mddev; - unsigned long sect = bh->b_blocknr; - int size = bh->b_size; - raid1_free_buf(r1_bh); - sync_request_done(sect, mddev_to_conf(mddev)); - md_done_sync(mddev,size>>9, uptodate); - } -} - -#define INVALID_LEVEL KERN_WARNING \ -"raid1: md%d: raid level not set to mirroring (%d)\n" - -#define NO_SB KERN_ERR \ -"raid1: disabled mirror %s (couldn't access raid superblock)\n" - -#define ERRORS KERN_ERR \ -"raid1: disabled mirror %s (errors detected)\n" - -#define NOT_IN_SYNC KERN_ERR \ -"raid1: disabled mirror %s (not in sync)\n" - -#define INCONSISTENT KERN_ERR \ -"raid1: disabled mirror %s (inconsistent descriptor)\n" - -#define ALREADY_RUNNING KERN_ERR \ -"raid1: disabled mirror %s (mirror %d already operational)\n" - -#define OPERATIONAL KERN_INFO \ -"raid1: device %s operational as mirror %d\n" - -#define MEM_ERROR KERN_ERR \ -"raid1: couldn't allocate memory for md%d\n" - -#define SPARE KERN_INFO \ -"raid1: spare disk %s\n" - -#define NONE_OPERATIONAL KERN_ERR \ -"raid1: no operational mirrors for md%d\n" - -#define ARRAY_IS_ACTIVE KERN_INFO \ -"raid1: raid set md%d active with %d out of %d mirrors\n" - -#define THREAD_ERROR KERN_ERR \ -"raid1: couldn't allocate thread for md%d\n" - -#define START_RESYNC KERN_WARNING \ -"raid1: raid set md%d not clean; reconstructing mirrors\n" - -static int raid1_run (mddev_t *mddev) -{ - raid1_conf_t *conf; - int i, j, disk_idx; - struct mirror_info *disk; - mdp_super_t *sb = mddev->sb; - mdp_disk_t *descriptor; - mdk_rdev_t *rdev; - struct md_list_head *tmp; - int start_recovery = 0; - - MOD_INC_USE_COUNT; - - if (sb->level != 1) { - printk(INVALID_LEVEL, mdidx(mddev), sb->level); - goto out; - } - /* - * copy 
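raid1_sync_request() grows the transfer size from 512 bytes up to a page as long as the sector number stays aligned and the larger block does not run past the end of the device. The loop below reproduces that doubling rule in isolation; page_size and dev_sectors are stand-ins for PAGE_SIZE and mddev->sb->size*2:

/* Pick the largest power-of-two block (<= a page) aligned at sector_nr. */
static int sync_block_size(unsigned long sector_nr, unsigned long dev_sectors,
			   int page_size)
{
	unsigned long block_nr = sector_nr;
	int bsize = 512;

	while (!(block_nr & 1) && bsize < page_size &&
	       (block_nr + 2) * (bsize >> 9) < dev_sectors) {
		block_nr >>= 1;
		bsize <<= 1;
	}
	return bsize;
}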
the already verified devices into our private RAID1 - * bookkeeping area. [whatever we allocate in raid1_run(), - * should be freed in raid1_stop()] - */ - - conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); - mddev->private = conf; - if (!conf) { - printk(MEM_ERROR, mdidx(mddev)); - goto out; - } - memset(conf, 0, sizeof(*conf)); - - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) { - printk(ERRORS, partition_name(rdev->dev)); - } else { - if (!rdev->sb) { - MD_BUG(); - continue; - } - } - if (rdev->desc_nr == -1) { - MD_BUG(); - continue; - } - descriptor = &sb->disks[rdev->desc_nr]; - disk_idx = descriptor->raid_disk; - disk = conf->mirrors + disk_idx; - - if (disk_faulty(descriptor)) { - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = rdev->dev; - disk->sect_limit = MAX_WORK_PER_DISK; - disk->operational = 0; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - disk->head_position = 0; - continue; - } - if (disk_active(descriptor)) { - if (!disk_sync(descriptor)) { - printk(NOT_IN_SYNC, - partition_name(rdev->dev)); - continue; - } - if ((descriptor->number > MD_SB_DISKS) || - (disk_idx > sb->raid_disks)) { - - printk(INCONSISTENT, - partition_name(rdev->dev)); - continue; - } - if (disk->operational) { - printk(ALREADY_RUNNING, - partition_name(rdev->dev), - disk_idx); - continue; - } - printk(OPERATIONAL, partition_name(rdev->dev), - disk_idx); - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = rdev->dev; - disk->sect_limit = MAX_WORK_PER_DISK; - disk->operational = 1; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - disk->head_position = 0; - conf->working_disks++; - } else { - /* - * Must be a spare disk .. - */ - printk(SPARE, partition_name(rdev->dev)); - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = rdev->dev; - disk->sect_limit = MAX_WORK_PER_DISK; - disk->operational = 0; - disk->write_only = 0; - disk->spare = 1; - disk->used_slot = 1; - disk->head_position = 0; - } - } - conf->raid_disks = sb->raid_disks; - conf->nr_disks = sb->nr_disks; - conf->mddev = mddev; - conf->device_lock = MD_SPIN_LOCK_UNLOCKED; - - conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; - init_waitqueue_head(&conf->wait_buffer); - init_waitqueue_head(&conf->wait_done); - init_waitqueue_head(&conf->wait_ready); - - if (!conf->working_disks) { - printk(NONE_OPERATIONAL, mdidx(mddev)); - goto out_free_conf; - } - - - /* pre-allocate some buffer_head structures. - * As a minimum, 1 r1bh and raid_disks buffer_heads - * would probably get us by in tight memory situations, - * but a few more is probably a good idea. 
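The ITERATE_RDEV loop above sorts each device descriptor into one of three buckets: faulty slots are recorded but left inactive, active descriptors must also be marked in-sync to be used, and everything else becomes a spare. A sketch of that classification order with hypothetical types (the real code reads the bits from the superblock descriptor):

enum disk_kind { DISK_FAULTY, DISK_ACTIVE, DISK_SPARE, DISK_IGNORED };

struct desc_bits { int faulty, active, sync; };

/*
 * Mirrors the decision order in raid1_run(): faulty first, then active
 * (which must also be in sync), everything else is treated as a spare.
 */
static enum disk_kind classify(struct desc_bits d)
{
	if (d.faulty)
		return DISK_FAULTY;
	if (d.active)
		return d.sync ? DISK_ACTIVE : DISK_IGNORED;
	return DISK_SPARE;
}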
- * For now, try NR_RESERVED_BUFS r1bh and - * NR_RESERVED_BUFS*raid_disks bufferheads - * This will allow at least NR_RESERVED_BUFS concurrent - * reads or writes even if kmalloc starts failing - */ - if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS || - raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks) - < NR_RESERVED_BUFS*conf->raid_disks) { - printk(MEM_ERROR, mdidx(mddev)); - goto out_free_conf; - } - - for (i = 0; i < MD_SB_DISKS; i++) { - - descriptor = sb->disks+i; - disk_idx = descriptor->raid_disk; - disk = conf->mirrors + disk_idx; - - if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && - !disk->used_slot) { - - disk->number = descriptor->number; - disk->raid_disk = disk_idx; - disk->dev = MKDEV(0,0); - - disk->operational = 0; - disk->write_only = 0; - disk->spare = 0; - disk->used_slot = 1; - disk->head_position = 0; - } - } - - /* - * find the first working one and use it as a starting point - * to read balancing. - */ - for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) - /* nothing */; - conf->last_used = j; - - - if (conf->working_disks != sb->raid_disks) { - printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); - start_recovery = 1; - } - - { - const char * name = "raid1d"; - - conf->thread = md_register_thread(raid1d, conf, name); - if (!conf->thread) { - printk(THREAD_ERROR, mdidx(mddev)); - goto out_free_conf; - } - } - - if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) && - (conf->working_disks > 1)) { - const char * name = "raid1syncd"; - - conf->resync_thread = md_register_thread(raid1syncd, conf,name); - if (!conf->resync_thread) { - printk(THREAD_ERROR, mdidx(mddev)); - goto out_free_conf; - } - - printk(START_RESYNC, mdidx(mddev)); - conf->resync_mirrors = 1; - md_wakeup_thread(conf->resync_thread); - } - - /* - * Regenerate the "device is in sync with the raid set" bit for - * each device. 
- */ - for (i = 0; i < MD_SB_DISKS; i++) { - mark_disk_nonsync(sb->disks+i); - for (j = 0; j < sb->raid_disks; j++) { - if (!conf->mirrors[j].operational) - continue; - if (sb->disks[i].number == conf->mirrors[j].number) - mark_disk_sync(sb->disks+i); - } - } - sb->active_disks = conf->working_disks; - - if (start_recovery) - md_recover_arrays(); - - - printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); - /* - * Ok, everything is just fine now - */ - return 0; - -out_free_conf: - raid1_shrink_r1bh(conf); - raid1_shrink_bh(conf); - raid1_shrink_buffers(conf); - kfree(conf); - mddev->private = NULL; -out: - MOD_DEC_USE_COUNT; - return -EIO; -} - -#undef INVALID_LEVEL -#undef NO_SB -#undef ERRORS -#undef NOT_IN_SYNC -#undef INCONSISTENT -#undef ALREADY_RUNNING -#undef OPERATIONAL -#undef SPARE -#undef NONE_OPERATIONAL -#undef ARRAY_IS_ACTIVE - -static int raid1_stop_resync (mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - - if (conf->resync_thread) { - if (conf->resync_mirrors) { - conf->resync_mirrors = 2; - md_interrupt_thread(conf->resync_thread); - - printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); - return 1; - } - return 0; - } - return 0; -} - -static int raid1_restart_resync (mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - - if (conf->resync_mirrors) { - if (!conf->resync_thread) { - MD_BUG(); - return 0; - } - conf->resync_mirrors = 1; - md_wakeup_thread(conf->resync_thread); - return 1; - } - return 0; -} - -static int raid1_stop (mddev_t *mddev) -{ - raid1_conf_t *conf = mddev_to_conf(mddev); - - md_unregister_thread(conf->thread); - if (conf->resync_thread) - md_unregister_thread(conf->resync_thread); - raid1_shrink_r1bh(conf); - raid1_shrink_bh(conf); - raid1_shrink_buffers(conf); - kfree(conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return 0; -} - -static mdk_personality_t raid1_personality= -{ - name: "raid1", - make_request: raid1_make_request, - run: raid1_run, - stop: raid1_stop, - status: raid1_status, - error_handler: raid1_error, - diskop: raid1_diskop, - stop_resync: raid1_stop_resync, - restart_resync: raid1_restart_resync, - sync_request: raid1_sync_request -}; - -static int md__init raid1_init (void) -{ - return register_md_personality (RAID1, &raid1_personality); -} - -static void raid1_exit (void) -{ - unregister_md_personality (RAID1); -} - -module_init(raid1_init); -module_exit(raid1_exit); -MODULE_LICENSE("GPL"); +/* + * raid1.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * RAID-1 management functions. + * + * Better read-balancing code written by Mika Kuoppala , 2000 + * + * Fixes to reconstruction by Jakob Østergaard" + * Various fixes by Neil Brown + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +#define MAX_WORK_PER_DISK 128 + +#define NR_RESERVED_BUFS 32 + + +/* + * The following can be used to debug the driver + */ +#define RAID1_DEBUG 0 + +#if RAID1_DEBUG +#define PRINTK(x...) printk(x) +#define inline +#define __inline__ +#else +#define PRINTK(x...) do { } while (0) +#endif + + +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; +struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; + +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) +{ + /* return a linked list of "cnt" struct buffer_heads. + * don't take any off the free list unless we know we can + * get all we need, otherwise we could deadlock + */ + struct buffer_head *bh=NULL; + + while(cnt) { + struct buffer_head *t; + md_spin_lock_irq(&conf->device_lock); + if (!conf->freebh_blocked && conf->freebh_cnt >= cnt) + while (cnt) { + t = conf->freebh; + conf->freebh = t->b_next; + t->b_next = bh; + bh = t; + t->b_state = 0; + conf->freebh_cnt--; + cnt--; + } + md_spin_unlock_irq(&conf->device_lock); + if (cnt == 0) + break; + t = kmem_cache_alloc(bh_cachep, SLAB_NOIO); + if (t) { + t->b_next = bh; + bh = t; + cnt--; + } else { + PRINTK("raid1: waiting for %d bh\n", cnt); + conf->freebh_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freebh_blocked || + conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2); + conf->freebh_blocked = 0; + } + } + return bh; +} + +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) +{ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + while (bh) { + struct buffer_head *t = bh; + bh=bh->b_next; + if (t->b_pprev == NULL) + kmem_cache_free(bh_cachep, t); + else { + t->b_next= conf->freebh; + conf->freebh = t; + conf->freebh_cnt++; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + wake_up(&conf->wait_buffer); +} + +static int raid1_grow_bh(raid1_conf_t *conf, int cnt) +{ + /* allocate cnt buffer_heads, possibly less if kmalloc fails */ + int i = 0; + + while (i < cnt) { + struct buffer_head *bh; + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); + if (!bh) break; + + md_spin_lock_irq(&conf->device_lock); + bh->b_pprev = &conf->freebh; + bh->b_next = conf->freebh; + conf->freebh = bh; + conf->freebh_cnt++; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static void raid1_shrink_bh(raid1_conf_t *conf) +{ + /* discard all buffer_heads */ + + md_spin_lock_irq(&conf->device_lock); + while (conf->freebh) { + struct buffer_head *bh = conf->freebh; + conf->freebh = bh->b_next; + kmem_cache_free(bh_cachep, bh); + conf->freebh_cnt--; + } + md_spin_unlock_irq(&conf->device_lock); +} + + +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh = NULL; + + do { + md_spin_lock_irq(&conf->device_lock); + if (!conf->freer1_blocked && conf->freer1) { + r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + r1_bh->next_r1 = NULL; + r1_bh->state = (1 << R1BH_PreAlloc); + r1_bh->bh_req.b_state = 0; + } + md_spin_unlock_irq(&conf->device_lock); + if (r1_bh) + return r1_bh; + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO); + if (r1_bh) { + memset(r1_bh, 0, sizeof(*r1_bh)); + return r1_bh; + } + conf->freer1_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freer1_blocked || + conf->freer1_cnt > NR_RESERVED_BUFS/2 + ); + 
conf->freer1_blocked = 0; + } while (1); +} + +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + + r1_bh->mirror_bh_list = NULL; + + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = conf->freer1; + conf->freer1 = r1_bh; + conf->freer1_cnt++; + spin_unlock_irqrestore(&conf->device_lock, flags); + /* don't need to wakeup wait_buffer because + * raid1_free_bh below will do that + */ + } else { + kfree(r1_bh); + } + raid1_free_bh(conf, bh); +} + +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + while (i < cnt) { + struct raid1_bh *r1_bh; + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) + break; + memset(r1_bh, 0, sizeof(*r1_bh)); + set_bit(R1BH_PreAlloc, &r1_bh->state); + r1_bh->mddev = conf->mddev; + + raid1_free_r1bh(r1_bh); + i++; + } + return i; +} + +static void raid1_shrink_r1bh(raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freer1) { + struct raid1_bh *r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + + + +static inline void raid1_free_buf(struct raid1_bh *r1_bh) +{ + unsigned long flags; + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + r1_bh->mirror_bh_list = NULL; + + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + spin_unlock_irqrestore(&conf->device_lock, flags); + raid1_free_bh(conf, bh); +} + +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh; + + md_spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); + r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + r1_bh->next_r1= NULL; + md_spin_unlock_irq(&conf->device_lock); + + return r1_bh; +} + +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + md_spin_lock_irq(&conf->device_lock); + while (i < cnt) { + struct raid1_bh *r1_bh; + struct page *page; + + page = alloc_page(GFP_KERNEL); + if (!page) + break; + + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) { + __free_page(page); + break; + } + memset(r1_bh, 0, sizeof(*r1_bh)); + r1_bh->bh_req.b_page = page; + r1_bh->bh_req.b_data = page_address(page); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + i++; + } + md_spin_unlock_irq(&conf->device_lock); + return i; +} + +static void raid1_shrink_buffers (raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freebuf) { + struct raid1_bh *r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + __free_page(r1_bh->bh_req.b_page); + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + +static int raid1_map (mddev_t *mddev, kdev_t *rdev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. 
+ */ + + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; + return (0); + } + } + + printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); + return (-1); +} + +static void raid1_reschedule_retry (struct raid1_bh *r1_bh) +{ + unsigned long flags; + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_spin_lock_irqsave(&retry_list_lock, flags); + if (raid1_retry_list == NULL) + raid1_retry_tail = &raid1_retry_list; + *raid1_retry_tail = r1_bh; + raid1_retry_tail = &r1_bh->next_r1; + r1_bh->next_r1 = NULL; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + md_wakeup_thread(conf->thread); +} + + +static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector < conf->start_active) + conf->cnt_done--; + else if (sector >= conf->start_future && conf->phase == phase) + conf->cnt_future--; + else if (!--conf->cnt_pending) + wake_up(&conf->wait_ready); + + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector >= conf->start_ready) + --conf->cnt_ready; + else if (sector >= conf->start_active) { + if (!--conf->cnt_active) { + conf->start_active = conf->start_ready; + wake_up(&conf->wait_done); + } + } + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +/* + * raid1_end_bh_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) +{ + struct buffer_head *bh = r1_bh->master_bh; + + io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), + test_bit(R1BH_SyncPhase, &r1_bh->state)); + + bh->b_end_io(bh, uptodate); + raid1_free_r1bh(r1_bh); +} +void raid1_end_request (struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + else + /* + * Set R1BH_Uptodate in our master buffer_head, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the complex operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' buffer_head. + */ + set_bit (R1BH_Uptodate, &r1_bh->state); + + /* + * We split up the read and write side, imho they are + * conceptually different. + */ + + if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { + /* + * we have only one buffer_head on the read side + */ + + if (uptodate) { + raid1_end_bh_io(r1_bh, uptodate); + return; + } + /* + * oops, read error: + */ + printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", + partition_name(bh->b_dev), bh->b_blocknr); + raid1_reschedule_retry(r1_bh); + return; + } + + /* + * WRITE: + * + * Let's see if all mirrored write operations have finished + * already. + */ + + if (atomic_dec_and_test(&r1_bh->remaining)) + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); +} + +/* + * This routine returns the disk from which the requested read should + * be done. 
It bookkeeps the last read position for every disk + * in array and when new read requests come, the disk which last + * position is nearest to the request, is chosen. + * + * TODO: now if there are 2 mirrors in the same 2 devices, performance + * degrades dramatically because position is mirror, not device based. + * This should be changed to be device based. Also atomic sequential + * reads should be somehow balanced. + */ + +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) +{ + int new_disk = conf->last_used; + const int sectors = bh->b_size >> 9; + const unsigned long this_sector = bh->b_rsector; + int disk = new_disk; + unsigned long new_distance; + unsigned long current_distance; + + /* + * Check if it is sane at all to balance + */ + + if (conf->resync_mirrors) + goto rb_out; + + + /* make sure that disk is operational */ + while( !conf->mirrors[new_disk].operational) { + if (new_disk <= 0) new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) { + /* + * This means no working disk was found + * Nothing much to do, lets not change anything + * and hope for the best... + */ + + new_disk = conf->last_used; + + goto rb_out; + } + } + disk = new_disk; + /* now disk == new_disk == starting point for search */ + + /* + * Don't touch anything for sequential reads. + */ + + if (this_sector == conf->mirrors[new_disk].head_position) + goto rb_out; + + /* + * If reads have been done only on a single disk + * for a time, lets give another disk a change. + * This is for kicking those idling disks so that + * they would find work near some hotspot. + */ + + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { + conf->sect_count = 0; + + do { + if (new_disk<=0) + new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) + break; + } while ((conf->mirrors[new_disk].write_only) || + (!conf->mirrors[new_disk].operational)); + + goto rb_out; + } + + current_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + /* Find the disk which is closest */ + + do { + if (disk <= 0) + disk = conf->raid_disks; + disk--; + + if ((conf->mirrors[disk].write_only) || + (!conf->mirrors[disk].operational)) + continue; + + new_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + if (new_distance < current_distance) { + conf->sect_count = 0; + current_distance = new_distance; + new_disk = disk; + } + } while (disk != conf->last_used); + +rb_out: + conf->mirrors[new_disk].head_position = this_sector + sectors; + + conf->last_used = new_disk; + conf->sect_count += sectors; + + return new_disk; +} + +static int raid1_make_request (mddev_t *mddev, int rw, + struct buffer_head * bh) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct buffer_head *bh_req, *bhl; + struct raid1_bh * r1_bh; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0; + struct mirror_info *mirror; + + if (!buffer_locked(bh)) + BUG(); + +/* + * make_request() can abort the operation when READA is being + * used and no empty request is available. + * + * Currently, just replace the command with READ/WRITE. 
+ */
+	if (rw == READA)
+		rw = READ;
+
+	r1_bh = raid1_alloc_r1bh (conf);
+
+	spin_lock_irq(&conf->segment_lock);
+	wait_event_lock_irq(conf->wait_done,
+			bh->b_rsector < conf->start_active ||
+			bh->b_rsector >= conf->start_future,
+			conf->segment_lock);
+	if (bh->b_rsector < conf->start_active)
+		conf->cnt_done++;
+	else {
+		conf->cnt_future++;
+		if (conf->phase)
+			set_bit(R1BH_SyncPhase, &r1_bh->state);
+	}
+	spin_unlock_irq(&conf->segment_lock);
+
+	/*
+	 * I think the read and write branch should be separated completely,
+	 * since we want to do read balancing on the read side for example.
+	 * Alternative implementations? :) --mingo
+	 */
+
+	r1_bh->master_bh = bh;
+	r1_bh->mddev = mddev;
+	r1_bh->cmd = rw;
+
+	if (rw == READ) {
+		/*
+		 * read balancing logic:
+		 */
+		mirror = conf->mirrors + raid1_read_balance(conf, bh);
+
+		bh_req = &r1_bh->bh_req;
+		memcpy(bh_req, bh, sizeof(*bh));
+		bh_req->b_blocknr = bh->b_rsector;
+		bh_req->b_dev = mirror->dev;
+		bh_req->b_rdev = mirror->dev;
+	/*	bh_req->b_rsector = bh->n_rsector; */
+		bh_req->b_end_io = raid1_end_request;
+		bh_req->b_private = r1_bh;
+		generic_make_request (rw, bh_req);
+		return 0;
+	}
+
+	/*
+	 * WRITE:
+	 */
+
+	bhl = raid1_alloc_bh(conf, conf->raid_disks);
+	for (i = 0; i < disks; i++) {
+		struct buffer_head *mbh;
+		if (!conf->mirrors[i].operational)
+			continue;
+
+	/*
+	 * We should use a private pool (size depending on NR_REQUEST),
+	 * to avoid writes filling up the memory with bhs
+	 *
+	 * Such pools are much faster than kmalloc anyways (so we waste
+	 * almost nothing by not using the master bh when writing and
+	 * win a lot of cleanness) but for now we are cool enough. --mingo
+	 *
+	 * It's safe to sleep here, buffer heads cannot be used in a shared
+	 * manner in the write branch. Look how we lock the buffer at the
+	 * beginning of this function to grok the difference ;)
+	 */
+		mbh = bhl;
+		if (mbh == NULL) {
+			MD_BUG();
+			break;
+		}
+		bhl = mbh->b_next;
+		mbh->b_next = NULL;
+		mbh->b_this_page = (struct buffer_head *)1;
+
+	/*
+	 * prepare mirrored mbh (fields ordered for max mem throughput):
+	 */
+		mbh->b_blocknr    = bh->b_rsector;
+		mbh->b_dev        = conf->mirrors[i].dev;
+		mbh->b_rdev       = conf->mirrors[i].dev;
+		mbh->b_rsector    = bh->b_rsector;
+		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
+					(1<<BH_Mapped) | (1<<BH_Lock);
+		atomic_set(&mbh->b_count, 1);
+		mbh->b_size       = bh->b_size;
+		mbh->b_page       = bh->b_page;
+		mbh->b_data       = bh->b_data;
+		mbh->b_list       = BUF_LOCKED;
+		mbh->b_end_io     = raid1_end_request;
+		mbh->b_private    = r1_bh;
+
+		mbh->b_next = r1_bh->mirror_bh_list;
+		r1_bh->mirror_bh_list = mbh;
+		sum_bhs++;
+	}
+	if (bhl) raid1_free_bh(conf,bhl);
+	if (!sum_bhs) {
+		/* Gag - all mirrors non-operational.. */
+		raid1_end_bh_io(r1_bh, 0);
+		return 0;
+	}
+	md_atomic_set(&r1_bh->remaining, sum_bhs);
+
+	/*
+	 * We have to be a bit careful about the semaphore above, that's
+	 * why we start the requests separately. Since kmalloc() could
+	 * fail, sleep and make_request() can sleep too, this is the
+	 * safer solution. Imagine, end_request decreasing the semaphore
+	 * before we could have set it up ... We could play tricks with
+	 * the semaphore (presetting it and correcting at the end if
+	 * sum_bhs is not 'n') but we have to do end_request by hand if
+	 * all requests finish before we had a chance to set up the
+	 * semaphore correctly ... lots of races).
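+	 *
+	 * (Editorial sketch, not part of the original comment: the
+	 * "semaphore" is the atomic r1_bh->remaining count armed to
+	 * sum_bhs just above, before any of the mirror writes is
+	 * submitted.  The completion side in raid1_end_request() pairs
+	 * with it as
+	 *
+	 *	if (atomic_dec_and_test(&r1_bh->remaining))
+	 *		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
+	 *
+	 * so the master buffer_head is only completed once the last
+	 * mirror write has returned, however submissions and
+	 * completions interleave.)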
+ */ + bh = r1_bh->mirror_bh_list; + while(bh) { + struct buffer_head *bh2 = bh; + bh = bh->b_next; + generic_make_request(rw, bh2); + } + return (0); +} + +static int raid1_status (char *page, mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int sz = 0, i; + + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", + conf->mirrors[i].operational ? "U" : "_"); + sz += sprintf (page+sz, "]"); + return sz; +} + +#define LAST_DISK KERN_ALERT \ +"raid1: only one disk left and IO error.\n" + +#define NO_SPARE_DISK KERN_ALERT \ +"raid1: no spare disk left, degrading mirror level by one.\n" + +#define DISK_FAILED KERN_ALERT \ +"raid1: Disk failure on %s, disabling device. \n" \ +" Operation continuing on %d devices\n" + +#define START_SYNCING KERN_ALERT \ +"raid1: start syncing spare disk.\n" + +#define ALREADY_SYNCING KERN_INFO \ +"raid1: syncing already in progress.\n" + +static void mark_disk_bad (mddev_t *mddev, int failed) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + if (!mirror->write_only) + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + if (!mirror->write_only) + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} + +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; + + /* Find the drive. + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + + for (i = 0; i < disks; i++) + if (mirrors[i].dev==dev && mirrors[i].operational) + break; + if (i == disks) + return 0; + + if (i < conf->raid_disks && conf->working_disks == 1) { + /* Don't fail the drive, act as though we were just a + * normal single drive + */ + + return 1; + } + mark_disk_bad(mddev, i); + return 0; +} + +#undef LAST_DISK +#undef NO_SPARE_DISK +#undef DISK_FAILED +#undef START_SYNCING + + +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; + + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static void close_sync(raid1_conf_t *conf) +{ + mddev_t *mddev = conf->mddev; + /* If reconstruction was interrupted, we need to close the "active" and "pending" + * holes. + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 + */ + /* this is really needed when recovery stops too... 
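+	 * (Editorial sketch, not part of the original comment: the
+	 * sequence below drains the segment bookkeeping in two rounds,
+	 * roughly
+	 *
+	 *	start_active = start_ready = start_pending;
+	 *	wait for cnt_pending to reach 0;	(old PENDING drained)
+	 *	collapse all boundaries up to start_future, flip conf->phase;
+	 *	wait for cnt_pending to reach 0 again;	(old FUTURE drained)
+	 *	reset all boundaries to 0 and carry any leftover count as FUTURE;
+	 *
+	 * after which normal I/O no longer has to honour any sync window.)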
*/ + spin_lock_irq(&conf->segment_lock); + conf->start_active = conf->start_pending; + conf->start_ready = conf->start_pending; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; + conf->start_future = mddev->sb->size+1; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + conf->phase = conf->phase ^1; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; + conf->phase = 0; + conf->cnt_future = conf->cnt_done;; + conf->cnt_done = 0; + spin_unlock_irq(&conf->segment_lock); + wake_up(&conf->wait_done); +} + +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + mdk_rdev_t *spare_rdev, *failed_rdev; + + print_raid1_conf(conf); + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + close_sync(conf); + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. 
(only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + close_sync(conf); + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + spare_rdev = find_rdev_nr(mddev, spare_desc->number); + failed_rdev = find_rdev_nr(mddev, failed_desc->number); + + /* There must be a spare_rdev, but there may not be a + * failed_rdev. That slot might be empty... + */ + spare_rdev->desc_nr = failed_desc->number; + if (failed_rdev) + failed_rdev->desc_nr = spare_desc->number; + + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ + + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + adisk->head_position = 0; + conf->nr_disks++; + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) + /* should move to "END_REBUILD" when such exists */ + raid1_shrink_buffers(conf); + + print_raid1_conf(conf); + return err; +} + + +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" + +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. 
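+ *
+ * (Editorial sketch, not part of the original comment: the loop below
+ * consumes the singly linked retry list that raid1_reschedule_retry()
+ * appends to under retry_list_lock,
+ *
+ *	md_spin_lock_irqsave(&retry_list_lock, flags);
+ *	r1_bh = raid1_retry_list;
+ *	if (!r1_bh)
+ *		break;
+ *	raid1_retry_list = r1_bh->next_r1;
+ *	md_spin_unlock_irqrestore(&retry_list_lock, flags);
+ *	... handle SPECIAL / READ / READA ...
+ *
+ * producers only touch the tail pointer, the consumer only the head,
+ * and the spinlock serialises both.)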
+ */ +static void end_sync_write(struct buffer_head *bh, int uptodate); +static void end_sync_read(struct buffer_head *bh, int uptodate); + +static void raid1d (void *data) +{ + struct raid1_bh *r1_bh; + struct buffer_head *bh; + unsigned long flags; + mddev_t *mddev; + kdev_t dev; + + + for (;;) { + md_spin_lock_irqsave(&retry_list_lock, flags); + r1_bh = raid1_retry_list; + if (!r1_bh) + break; + raid1_retry_list = r1_bh->next_r1; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + + mddev = r1_bh->mddev; + if (mddev->sb_dirty) { + printk(KERN_INFO "raid1: dirty sb detected, updating.\n"); + mddev->sb_dirty = 0; + md_update_sb(mddev); + } + bh = &r1_bh->bh_req; + switch(r1_bh->cmd) { + case SPECIAL: + /* have to allocate lots of bh structures and + * schedule writes + */ + if (test_bit(R1BH_Uptodate, &r1_bh->state)) { + int i, sum_bhs = 0; + int disks = MD_SB_DISKS; + struct buffer_head *bhl, *mbh; + raid1_conf_t *conf; + + conf = mddev_to_conf(mddev); + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ + for (i = 0; i < disks ; i++) { + if (!conf->mirrors[i].operational) + continue; + if (i==conf->last_used) + /* we read from here, no need to write */ + continue; + if (i < conf->raid_disks + && !conf->resync_mirrors) + /* don't need to write this, + * we are just rebuilding */ + continue; + mbh = bhl; + if (!mbh) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_this_page = (struct buffer_head *)1; + + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_blocknr; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_blocknr; + mbh->b_state = (1<b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = end_sync_write; + mbh->b_private = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + + sum_bhs++; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + if (bhl) raid1_free_bh(conf, bhl); + mbh = r1_bh->mirror_bh_list; + + if (!sum_bhs) { + /* nowhere to write this too... I guess we + * must be done + */ + sync_request_done(bh->b_blocknr, conf); + md_done_sync(mddev, bh->b_size>>9, 0); + raid1_free_buf(r1_bh); + } else + while (mbh) { + struct buffer_head *bh1 = mbh; + mbh = mbh->b_next; + generic_make_request(WRITE, bh1); + md_sync_acct(bh1->b_dev, bh1->b_size/512); + } + } else { + /* There is no point trying a read-for-reconstruct + * as reconstruct is about to be aborted + */ + + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + md_done_sync(mddev, bh->b_size>>9, 0); + } + + break; + case READ: + case READA: + dev = bh->b_dev; + raid1_map (mddev, &bh->b_dev); + if (bh->b_dev == dev) { + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); + } else { + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + bh->b_rdev = bh->b_dev; + bh->b_rsector = bh->b_blocknr; + generic_make_request (r1_bh->cmd, bh); + } + break; + } + } + md_spin_unlock_irqrestore(&retry_list_lock, flags); +} +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. 
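+ *
+ * (Editorial note, not part of the original comment: conf->resync_mirrors
+ * acts as a small state flag for this thread --
+ *
+ *	0  no resync pending, or the last resync completed cleanly,
+ *	1  resync requested; raid1syncd should call md_do_sync(),
+ *	2  resync interrupted by raid1_stop_resync(); do not restart here,
+ *
+ * which is why the function below bails out for 0 and 2 and only clears
+ * the flag when md_do_sync() returns success.)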
+ */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (!md_do_sync(mddev, NULL)) { + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + } + + close_sync(conf); + + up(&mddev->recovery_sem); + raid1_shrink_buffers(conf); +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * This is achieved by conceptually dividing the device space into a + * number of sections: + * DONE: 0 .. a-1 These blocks are in-sync + * ACTIVE: a.. b-1 These blocks may have active sync requests, but + * no normal IO requests + * READY: b .. c-1 These blocks have no normal IO requests - sync + * request may be happening + * PENDING: c .. d-1 These blocks may have IO requests, but no new + * ones will be added + * FUTURE: d .. end These blocks are not to be considered yet. IO may + * be happening, but not sync + * + * We keep a + * phase which flips (0 or 1) each time d moves and + * a count of: + * z = active io requests in FUTURE since d moved - marked with + * current phase + * y = active io requests in FUTURE before d moved, or PENDING - + * marked with previous phase + * x = active sync requests in READY + * w = active sync requests in ACTIVE + * v = active io requests in DONE + * + * Normally, a=b=c=d=0 and z= active io requests + * or a=b=c=d=END and v= active io requests + * Allowed changes to a,b,c,d: + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase + * B: y==0 -> c=d + * C: b=c, w+=x, x=0 + * D: w==0 -> a=b + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 + * + * At start of sync we apply A. + * When y reaches 0, we apply B then A then being sync requests + * When sync point reaches c-1, we wait for y==0, and W==0, and + * then apply apply B then A then D then C. + * Finally, we apply E + * + * The sync request simply issues a "read" against a working drive + * This is marked so that on completion the raid1d thread is woken to + * issue suitable write requests + */ + +static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror; + struct raid1_bh *r1_bh; + struct buffer_head *bh; + int bsize; + int disk; + int block_nr; + + spin_lock_irq(&conf->segment_lock); + if (!sector_nr) { + /* initialize ...*/ + int buffs; + conf->start_active = 0; + conf->start_ready = 0; + conf->start_pending = 0; + conf->start_future = 0; + conf->phase = 0; + /* we want enough buffers to hold twice the window of 128*/ + buffs = 128 *2 / (PAGE_SIZE>>9); + buffs = raid1_grow_buffers(conf, buffs); + if (buffs < 2) + goto nomem; + + conf->window = buffs*(PAGE_SIZE>>9)/2; + conf->cnt_future += conf->cnt_done+conf->cnt_pending; + conf->cnt_done = conf->cnt_pending = 0; + if (conf->cnt_ready || conf->cnt_active) + MD_BUG(); + } + while (sector_nr >= conf->start_pending) { + PRINTK("wait .. 
sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
+			sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
+			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
+		wait_event_lock_irq(conf->wait_done,
+					!conf->cnt_active,
+					conf->segment_lock);
+		wait_event_lock_irq(conf->wait_ready,
+					!conf->cnt_pending,
+					conf->segment_lock);
+		conf->start_active = conf->start_ready;
+		conf->start_ready = conf->start_pending;
+		conf->start_pending = conf->start_future;
+		conf->start_future = conf->start_future+conf->window;
+		// Note: falling off the end is not a problem
+		conf->phase = conf->phase ^1;
+		conf->cnt_active = conf->cnt_ready;
+		conf->cnt_ready = 0;
+		conf->cnt_pending = conf->cnt_future;
+		conf->cnt_future = 0;
+		wake_up(&conf->wait_done);
+	}
+	conf->cnt_ready++;
+	spin_unlock_irq(&conf->segment_lock);
+
+
+	/* If reconstructing, and >1 working disc,
+	 * could dedicate one to rebuild and others to
+	 * service read requests ..
+	 */
+	disk = conf->last_used;
+	/* make sure disk is operational */
+	while (!conf->mirrors[disk].operational) {
+		if (disk <= 0) disk = conf->raid_disks;
+		disk--;
+		if (disk == conf->last_used)
+			break;
+	}
+	conf->last_used = disk;
+
+	mirror = conf->mirrors+conf->last_used;
+
+	r1_bh = raid1_alloc_buf (conf);
+	r1_bh->master_bh = NULL;
+	r1_bh->mddev = mddev;
+	r1_bh->cmd = SPECIAL;
+	bh = &r1_bh->bh_req;
+
+	block_nr = sector_nr;
+	bsize = 512;
+	while (!(block_nr & 1) && bsize < PAGE_SIZE
+			&& (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
+		block_nr >>= 1;
+		bsize <<= 1;
+	}
+	bh->b_size = bsize;
+	bh->b_list = BUF_LOCKED;
+	bh->b_dev = mirror->dev;
+	bh->b_rdev = mirror->dev;
+	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
+	if (!bh->b_page)
+		BUG();
+	if (!bh->b_data)
+		BUG();
+	if (bh->b_data != page_address(bh->b_page))
+		BUG();
+	bh->b_end_io = end_sync_read;
+	bh->b_private = r1_bh;
+	bh->b_blocknr = sector_nr;
+	bh->b_rsector = sector_nr;
+	init_waitqueue_head(&bh->b_wait);
+
+	generic_make_request(READ, bh);
+	md_sync_acct(bh->b_dev, bh->b_size/512);
+
+	return (bsize >> 9);
+
+nomem:
+	raid1_shrink_buffers(conf);
+	spin_unlock_irq(&conf->segment_lock);
+	return -ENOMEM;
+}
+
+static void end_sync_read(struct buffer_head *bh, int uptodate)
+{
+	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+	/* we have read a block, now it needs to be re-written,
+	 * or re-read if the read failed.
+ * We don't do much here, just schedule handling by raid1d + */ + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + else + set_bit(R1BH_Uptodate, &r1_bh->state); + raid1_reschedule_retry(r1_bh); +} + +static void end_sync_write(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + if (atomic_dec_and_test(&r1_bh->remaining)) { + mddev_t *mddev = r1_bh->mddev; + unsigned long sect = bh->b_blocknr; + int size = bh->b_size; + raid1_free_buf(r1_bh); + sync_request_done(sect, mddev_to_conf(mddev)); + md_done_sync(mddev,size>>9, uptodate); + } +} + +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) +{ + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; + + MOD_INC_USE_COUNT; + + if (sb->level != 1) { + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. 
[whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] + */ + + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + memset(conf, 0, sizeof(*conf)); + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); + continue; + } + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + continue; + } + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); + continue; + } + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); + continue; + } + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); + continue; + } + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + conf->working_disks++; + } else { + /* + * Must be a spare disk .. + */ + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + disk->head_position = 0; + } + } + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_buffer); + init_waitqueue_head(&conf->wait_done); + init_waitqueue_head(&conf->wait_ready); + + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + + /* pre-allocate some buffer_head structures. + * As a minimum, 1 r1bh and raid_disks buffer_heads + * would probably get us by in tight memory situations, + * but a few more is probably a good idea. 
+ * For now, try NR_RESERVED_BUFS r1bh and + * NR_RESERVED_BUFS*raid_disks bufferheads + * This will allow at least NR_RESERVED_BUFS concurrent + * reads or writes even if kmalloc starts failing + */ + if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS || + raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks) + < NR_RESERVED_BUFS*conf->raid_disks) { + printk(MEM_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) + /* nothing */; + conf->last_used = j; + + + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; + } + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) && + (conf->working_disks > 1)) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + } + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. 
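+	 *
+	 * (Editorial illustration, not part of the original comment:
+	 * with, say, descriptors numbered 0, 1 and 2 in sb->disks[] but
+	 * only mirrors 0 and 1 operational, the nested loops below first
+	 * mark all three nonsync and then set the sync bit again only on
+	 * descriptors 0 and 1, so the spare or failed disk 2 stays
+	 * flagged as out of date.)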
+ */ + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->mirrors[j].operational) + continue; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); + } + } + sb->active_disks = conf->working_disks; + + if (start_recovery) + md_recover_arrays(); + + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + md_interrupt_thread(conf->resync_thread); + + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; +} + +static int raid1_stop (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; + MOD_DEC_USE_COUNT; + return 0; +} + +static mdk_personality_t raid1_personality= +{ + name: "raid1", + make_request: raid1_make_request, + run: raid1_run, + stop: raid1_stop, + status: raid1_status, + error_handler: raid1_error, + diskop: raid1_diskop, + stop_resync: raid1_stop_resync, + restart_resync: raid1_restart_resync, + sync_request: raid1_sync_request +}; + +static int md__init raid1_init (void) +{ + return register_md_personality (RAID1, &raid1_personality); +} + +static void raid1_exit (void) +{ + unregister_md_personality (RAID1); +} + +module_init(raid1_init); +module_exit(raid1_exit); +MODULE_LICENSE("GPL");
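
Editorial appendix (not part of the diff above): the window bookkeeping
described in the comment before raid1_sync_request() is dense, so the
following self-contained sketch models just the boundary/counter shuffle
performed by the while-loop in that function.  Every identifier here
(struct sync_window, advance_window, the 64-sector stride in main) is
invented for illustration, waiting for counters to drain is assumed to
be instantaneous, and all locking is ignored.

#include <stdio.h>

/* Model of the five region boundaries and per-region counters that
 * raid1_sync_request() advances once ACTIVE and PENDING have drained.
 */
struct sync_window {
	unsigned long start_active, start_ready, start_pending, start_future;
	int phase;
	int cnt_active, cnt_ready, cnt_pending, cnt_future;
	unsigned long window;			/* sectors gained per advance */
};

static void advance_window(struct sync_window *w)
{
	/* same shuffle as the driver: each region inherits the next one */
	w->start_active  = w->start_ready;
	w->start_ready   = w->start_pending;
	w->start_pending = w->start_future;
	w->start_future += w->window;
	w->phase ^= 1;
	w->cnt_active  = w->cnt_ready;   w->cnt_ready  = 0;
	w->cnt_pending = w->cnt_future;  w->cnt_future = 0;
}

int main(void)
{
	struct sync_window w = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 };
	unsigned long sector;

	/* issue sync reads at increasing sectors; advance the window
	 * whenever the next sector falls beyond start_pending, as the
	 * driver does before allocating a sync buffer_head */
	for (sector = 0; sector < 512; sector += 64) {
		while (sector >= w.start_pending)
			advance_window(&w);
		w.cnt_ready++;		/* request now lives in READY */
		printf("sector %3lu: active=%lu ready=%lu pending=%lu future=%lu phase=%d\n",
		       sector, w.start_active, w.start_ready,
		       w.start_pending, w.start_future, w.phase);
		w.cnt_ready--;		/* pretend it completed at once */
	}
	return 0;
}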