/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard
 * Various fixes by Neil Brown
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/raid1.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

#define	NR_RESERVED_BUFS	32

/*
 * The following can be used to debug the driver
 */
#define RAID1_DEBUG	0

#if RAID1_DEBUG
#define PRINTK(x...)	printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...)	do { } while (0)
#endif

static mdk_personality_t raid1_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;

static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
{
	/* return a linked list of "cnt" struct buffer_heads.
	 * don't take any off the free list unless we know we can
	 * get all we need, otherwise we could deadlock
	 */
	struct buffer_head *bh = NULL;

	while (cnt) {
		struct buffer_head *t;
		md_spin_lock_irq(&conf->device_lock);
		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
			while (cnt) {
				t = conf->freebh;
				conf->freebh = t->b_next;
				t->b_next = bh;
				bh = t;
				t->b_state = 0;
				conf->freebh_cnt--;
				cnt--;
			}
		md_spin_unlock_irq(&conf->device_lock);
		if (cnt == 0)
			break;
		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
		if (t) {
			t->b_next = bh;
			bh = t;
			cnt--;
		} else {
			PRINTK("raid1: waiting for %d bh\n", cnt);
			conf->freebh_blocked = 1;
			wait_disk_event(conf->wait_buffer,
					!conf->freebh_blocked ||
					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
			conf->freebh_blocked = 0;
		}
	}
	return bh;
}

static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->device_lock, flags);
	while (bh) {
		struct buffer_head *t = bh;
		bh = bh->b_next;
		if (t->b_pprev == NULL)
			kmem_cache_free(bh_cachep, t);
		else {
			t->b_next = conf->freebh;
			conf->freebh = t;
			conf->freebh_cnt++;
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	wake_up(&conf->wait_buffer);
}

static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
{
	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
	int i = 0;

	while (i < cnt) {
		struct buffer_head *bh;
		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
		if (!bh)
			break;

		md_spin_lock_irq(&conf->device_lock);
		bh->b_pprev = &conf->freebh;
		bh->b_next = conf->freebh;
		conf->freebh = bh;
		conf->freebh_cnt++;
		md_spin_unlock_irq(&conf->device_lock);

		i++;
	}
	return i;
}

static void raid1_shrink_bh(raid1_conf_t *conf)
{
	/* discard all buffer_heads */

	md_spin_lock_irq(&conf->device_lock);
	while (conf->freebh) {
		struct buffer_head *bh = conf->freebh;
		conf->freebh = bh->b_next;
		kmem_cache_free(bh_cachep, bh);
		conf->freebh_cnt--;
	}
	md_spin_unlock_irq(&conf->device_lock);
}

static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
{
	struct raid1_bh *r1_bh = NULL;

	do {
		md_spin_lock_irq(&conf->device_lock);
		if (!conf->freer1_blocked &&
conf->freer1) { r1_bh = conf->freer1; conf->freer1 = r1_bh->next_r1; conf->freer1_cnt--; r1_bh->next_r1 = NULL; r1_bh->state = (1 << R1BH_PreAlloc); r1_bh->bh_req.b_state = 0; } md_spin_unlock_irq(&conf->device_lock); if (r1_bh) return r1_bh; r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO); if (r1_bh) { memset(r1_bh, 0, sizeof(*r1_bh)); return r1_bh; } conf->freer1_blocked = 1; wait_disk_event(conf->wait_buffer, !conf->freer1_blocked || conf->freer1_cnt > NR_RESERVED_BUFS/2 ); conf->freer1_blocked = 0; } while (1); } static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) { struct buffer_head *bh = r1_bh->mirror_bh_list; raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); r1_bh->mirror_bh_list = NULL; if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); r1_bh->next_r1 = conf->freer1; conf->freer1 = r1_bh; conf->freer1_cnt++; spin_unlock_irqrestore(&conf->device_lock, flags); /* don't need to wakeup wait_buffer because * raid1_free_bh below will do that */ } else { kfree(r1_bh); } raid1_free_bh(conf, bh); } static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) { int i = 0; while (i < cnt) { struct raid1_bh *r1_bh; r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); if (!r1_bh) break; memset(r1_bh, 0, sizeof(*r1_bh)); set_bit(R1BH_PreAlloc, &r1_bh->state); r1_bh->mddev = conf->mddev; raid1_free_r1bh(r1_bh); i++; } return i; } static void raid1_shrink_r1bh(raid1_conf_t *conf) { md_spin_lock_irq(&conf->device_lock); while (conf->freer1) { struct raid1_bh *r1_bh = conf->freer1; conf->freer1 = r1_bh->next_r1; conf->freer1_cnt--; kfree(r1_bh); } md_spin_unlock_irq(&conf->device_lock); } static inline void raid1_free_buf(struct raid1_bh *r1_bh) { unsigned long flags; struct buffer_head *bh = r1_bh->mirror_bh_list; raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); r1_bh->mirror_bh_list = NULL; spin_lock_irqsave(&conf->device_lock, flags); r1_bh->next_r1 = conf->freebuf; conf->freebuf = r1_bh; spin_unlock_irqrestore(&conf->device_lock, flags); raid1_free_bh(conf, bh); } static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) { struct raid1_bh *r1_bh; md_spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); r1_bh = conf->freebuf; conf->freebuf = r1_bh->next_r1; r1_bh->next_r1= NULL; md_spin_unlock_irq(&conf->device_lock); return r1_bh; } static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) { int i = 0; md_spin_lock_irq(&conf->device_lock); while (i < cnt) { struct raid1_bh *r1_bh; struct page *page; page = alloc_page(GFP_KERNEL); if (!page) break; r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); if (!r1_bh) { __free_page(page); break; } memset(r1_bh, 0, sizeof(*r1_bh)); r1_bh->bh_req.b_page = page; r1_bh->bh_req.b_data = page_address(page); r1_bh->next_r1 = conf->freebuf; conf->freebuf = r1_bh; i++; } md_spin_unlock_irq(&conf->device_lock); return i; } static void raid1_shrink_buffers (raid1_conf_t *conf) { md_spin_lock_irq(&conf->device_lock); while (conf->freebuf) { struct raid1_bh *r1_bh = conf->freebuf; conf->freebuf = r1_bh->next_r1; __free_page(r1_bh->bh_req.b_page); kfree(r1_bh); } md_spin_unlock_irq(&conf->device_lock); } static int raid1_map (mddev_t *mddev, kdev_t *rdev) { raid1_conf_t *conf = mddev_to_conf(mddev); int i, disks = MD_SB_DISKS; /* * Later we do read balancing on the read side * now we use the first available disk. 
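	 * (raid1_map() is only called from raid1d further down, to redirect a
	 * failed read to whichever mirror is still operational.)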
*/ for (i = 0; i < disks; i++) { if (conf->mirrors[i].operational) { *rdev = conf->mirrors[i].dev; return (0); } } printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); return (-1); } static void raid1_reschedule_retry (struct raid1_bh *r1_bh) { unsigned long flags; mddev_t *mddev = r1_bh->mddev; raid1_conf_t *conf = mddev_to_conf(mddev); md_spin_lock_irqsave(&retry_list_lock, flags); if (raid1_retry_list == NULL) raid1_retry_tail = &raid1_retry_list; *raid1_retry_tail = r1_bh; raid1_retry_tail = &r1_bh->next_r1; r1_bh->next_r1 = NULL; md_spin_unlock_irqrestore(&retry_list_lock, flags); md_wakeup_thread(conf->thread); } static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) { unsigned long flags; spin_lock_irqsave(&conf->segment_lock, flags); if (sector < conf->start_active) conf->cnt_done--; else if (sector >= conf->start_future && conf->phase == phase) conf->cnt_future--; else if (!--conf->cnt_pending) wake_up(&conf->wait_ready); spin_unlock_irqrestore(&conf->segment_lock, flags); } static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) { unsigned long flags; spin_lock_irqsave(&conf->segment_lock, flags); if (sector >= conf->start_ready) --conf->cnt_ready; else if (sector >= conf->start_active) { if (!--conf->cnt_active) { conf->start_active = conf->start_ready; wake_up(&conf->wait_done); } } spin_unlock_irqrestore(&conf->segment_lock, flags); } /* * raid1_end_bh_io() is called when we have finished servicing a mirrored * operation and are ready to return a success/failure code to the buffer * cache layer. */ static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) { struct buffer_head *bh = r1_bh->master_bh; io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), test_bit(R1BH_SyncPhase, &r1_bh->state)); bh->b_end_io(bh, uptodate); raid1_free_r1bh(r1_bh); } void raid1_end_request (struct buffer_head *bh, int uptodate) { struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) md_error (r1_bh->mddev, bh->b_dev); else /* * Set R1BH_Uptodate in our master buffer_head, so that * we will return a good error code for to the higher * levels even if IO on some other mirrored buffer fails. * * The 'master' represents the complex operation to * user-side. So if something waits for IO, then it will * wait for the 'master' buffer_head. */ set_bit (R1BH_Uptodate, &r1_bh->state); /* * We split up the read and write side, imho they are * conceptually different. */ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { /* * we have only one buffer_head on the read side */ if (uptodate) { raid1_end_bh_io(r1_bh, uptodate); return; } /* * oops, read error: */ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", partition_name(bh->b_dev), bh->b_blocknr); raid1_reschedule_retry(r1_bh); return; } /* * WRITE: * * Let's see if all mirrored write operations have finished * already. */ if (atomic_dec_and_test(&r1_bh->remaining)) raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); } /* * This routine returns the disk from which the requested read should * be done. It bookkeeps the last read position for every disk * in array and when new read requests come, the disk which last * position is nearest to the request, is chosen. * * TODO: now if there are 2 mirrors in the same 2 devices, performance * degrades dramatically because position is mirror, not device based. * This should be changed to be device based. 
Also atomic sequential * reads should be somehow balanced. */ static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) { int new_disk = conf->last_used; const int sectors = bh->b_size >> 9; const unsigned long this_sector = bh->b_rsector; int disk = new_disk; unsigned long new_distance; unsigned long current_distance; /* * Check if it is sane at all to balance */ if (conf->resync_mirrors) goto rb_out; /* make sure that disk is operational */ while( !conf->mirrors[new_disk].operational) { if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { /* * This means no working disk was found * Nothing much to do, lets not change anything * and hope for the best... */ new_disk = conf->last_used; goto rb_out; } } disk = new_disk; /* now disk == new_disk == starting point for search */ /* * Don't touch anything for sequential reads. */ if (this_sector == conf->mirrors[new_disk].head_position) goto rb_out; /* * If reads have been done only on a single disk * for a time, lets give another disk a change. * This is for kicking those idling disks so that * they would find work near some hotspot. */ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { conf->sect_count = 0; do { if (new_disk<=0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) break; } while ((conf->mirrors[new_disk].write_only) || (!conf->mirrors[new_disk].operational)); goto rb_out; } current_distance = abs(this_sector - conf->mirrors[disk].head_position); /* Find the disk which is closest */ do { if (disk <= 0) disk = conf->raid_disks; disk--; if ((conf->mirrors[disk].write_only) || (!conf->mirrors[disk].operational)) continue; new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { conf->sect_count = 0; current_distance = new_distance; new_disk = disk; } } while (disk != conf->last_used); rb_out: conf->mirrors[new_disk].head_position = this_sector + sectors; conf->last_used = new_disk; conf->sect_count += sectors; return new_disk; } static int raid1_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { raid1_conf_t *conf = mddev_to_conf(mddev); struct buffer_head *bh_req, *bhl; struct raid1_bh * r1_bh; int disks = MD_SB_DISKS; int i, sum_bhs = 0; struct mirror_info *mirror; if (!buffer_locked(bh)) BUG(); /* * make_request() can abort the operation when READA is being * used and no empty request is available. * * Currently, just replace the command with READ/WRITE. */ if (rw == READA) rw = READ; r1_bh = raid1_alloc_r1bh (conf); spin_lock_irq(&conf->segment_lock); wait_event_lock_irq(conf->wait_done, bh->b_rsector < conf->start_active || bh->b_rsector >= conf->start_future, conf->segment_lock); if (bh->b_rsector < conf->start_active) conf->cnt_done++; else { conf->cnt_future++; if (conf->phase) set_bit(R1BH_SyncPhase, &r1_bh->state); } spin_unlock_irq(&conf->segment_lock); /* * i think the read and write branch should be separated completely, * since we want to do read balancing on the read side for example. * Alternative implementations? 
:) --mingo
	 */

	r1_bh->master_bh = bh;
	r1_bh->mddev = mddev;
	r1_bh->cmd = rw;

	if (rw == READ) {
		/*
		 * read balancing logic:
		 */
		mirror = conf->mirrors + raid1_read_balance(conf, bh);

		bh_req = &r1_bh->bh_req;
		memcpy(bh_req, bh, sizeof(*bh));
		bh_req->b_blocknr = bh->b_rsector;
		bh_req->b_dev = mirror->dev;
		bh_req->b_rdev = mirror->dev;
	/*	bh_req->b_rsector = bh->n_rsector; */
		bh_req->b_end_io = raid1_end_request;
		bh_req->b_private = r1_bh;
		generic_make_request (rw, bh_req);
		return 0;
	}

	/*
	 * WRITE:
	 */

	bhl = raid1_alloc_bh(conf, conf->raid_disks);
	for (i = 0; i < disks; i++) {
		struct buffer_head *mbh;
		if (!conf->mirrors[i].operational)
			continue;

	/*
	 * We should use a private pool (size depending on NR_REQUEST),
	 * to avoid writes filling up the memory with bhs
	 *
	 * Such pools are much faster than kmalloc anyways (so we waste
	 * almost nothing by not using the master bh when writing and
	 * win a lot of cleanness) but for now we are cool enough. --mingo
	 *
	 * It's safe to sleep here, buffer heads cannot be used in a shared
	 * manner in the write branch. Look how we lock the buffer at the
	 * beginning of this function to grok the difference ;)
	 */

		mbh = bhl;
		if (mbh == NULL) {
			MD_BUG();
			break;
		}
		bhl = mbh->b_next;
		mbh->b_next = NULL;
		mbh->b_this_page = (struct buffer_head *)1;

	/*
	 * prepare mirrored mbh (fields ordered for max mem throughput):
	 */

		mbh->b_blocknr    = bh->b_rsector;
		mbh->b_dev        = conf->mirrors[i].dev;
		mbh->b_rdev       = conf->mirrors[i].dev;
		mbh->b_rsector    = bh->b_rsector;
		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
				    (1<<BH_Mapped) | (1<<BH_Lock);
		atomic_set(&mbh->b_count, 1);
		mbh->b_size       = bh->b_size;
		mbh->b_page       = bh->b_page;
		mbh->b_data       = bh->b_data;
		mbh->b_list       = BUF_LOCKED;
		mbh->b_end_io     = raid1_end_request;
		mbh->b_private    = r1_bh;

		mbh->b_next = r1_bh->mirror_bh_list;
		r1_bh->mirror_bh_list = mbh;
		sum_bhs++;
	}
	if (bhl) raid1_free_bh(conf,bhl);
	if (!sum_bhs) {
		/* Gag - all mirrors non-operational.. */
		raid1_end_bh_io(r1_bh, 0);
		return 0;
	}
	md_atomic_set(&r1_bh->remaining, sum_bhs);

	/*
	 * We have to be a bit careful about the semaphore above, that's
	 * why we start the requests separately. Since kmalloc() could
	 * fail, sleep and make_request() can sleep too, this is the
	 * safer solution. Imagine, end_request decreasing the semaphore
	 * before we could have set it up ... We could play tricks with
	 * the semaphore (presetting it and correcting at the end if
	 * sum_bhs is not 'n' but we have to do end_request by hand if
	 * all requests finish until we had a chance to set up the
	 * semaphore correctly ... lots of races).
	 */
	bh = r1_bh->mirror_bh_list;
	while (bh) {
		struct buffer_head *bh2 = bh;
		bh = bh->b_next;
		generic_make_request(rw, bh2);
	}
	return (0);
}

static int raid1_status (char *page, mddev_t *mddev)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	int sz = 0, i;

	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
						 conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		sz += sprintf (page+sz, "%s",
			conf->mirrors[i].operational ? "U" : "_");
	sz += sprintf (page+sz, "]");
	return sz;
}

#define LAST_DISK KERN_ALERT \
"raid1: only one disk left and IO error.\n"

#define NO_SPARE_DISK KERN_ALERT \
"raid1: no spare disk left, degrading mirror level by one.\n"

#define DISK_FAILED KERN_ALERT \
"raid1: Disk failure on %s, disabling device. 
\n" \ " Operation continuing on %d devices\n" #define START_SYNCING KERN_ALERT \ "raid1: start syncing spare disk.\n" #define ALREADY_SYNCING KERN_INFO \ "raid1: syncing already in progress.\n" static void mark_disk_bad (mddev_t *mddev, int failed) { raid1_conf_t *conf = mddev_to_conf(mddev); struct mirror_info *mirror = conf->mirrors+failed; mdp_super_t *sb = mddev->sb; mirror->operational = 0; mark_disk_faulty(sb->disks+mirror->number); mark_disk_nonsync(sb->disks+mirror->number); mark_disk_inactive(sb->disks+mirror->number); if (!mirror->write_only) sb->active_disks--; sb->working_disks--; sb->failed_disks++; mddev->sb_dirty = 1; md_wakeup_thread(conf->thread); if (!mirror->write_only) conf->working_disks--; printk (DISK_FAILED, partition_name (mirror->dev), conf->working_disks); } static int raid1_error (mddev_t *mddev, kdev_t dev) { raid1_conf_t *conf = mddev_to_conf(mddev); struct mirror_info * mirrors = conf->mirrors; int disks = MD_SB_DISKS; int i; /* Find the drive. * If it is not operational, then we have already marked it as dead * else if it is the last working disks, ignore the error, let the * next level up know. * else mark the drive as failed */ for (i = 0; i < disks; i++) if (mirrors[i].dev==dev && mirrors[i].operational) break; if (i == disks) return 0; if (i < conf->raid_disks && conf->working_disks == 1) { /* Don't fail the drive, act as though we were just a * normal single drive */ return 1; } mark_disk_bad(mddev, i); return 0; } #undef LAST_DISK #undef NO_SPARE_DISK #undef DISK_FAILED #undef START_SYNCING static void print_raid1_conf (raid1_conf_t *conf) { int i; struct mirror_info *tmp; printk("RAID1 conf printout:\n"); if (!conf) { printk("(conf==NULL)\n"); return; } printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, conf->raid_disks, conf->nr_disks); for (i = 0; i < MD_SB_DISKS; i++) { tmp = conf->mirrors + i; printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", i, tmp->spare,tmp->operational, tmp->number,tmp->raid_disk,tmp->used_slot, partition_name(tmp->dev)); } } static void close_sync(raid1_conf_t *conf) { mddev_t *mddev = conf->mddev; /* If reconstruction was interrupted, we need to close the "active" and "pending" * holes. * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 */ /* this is really needed when recovery stops too... */ spin_lock_irq(&conf->segment_lock); conf->start_active = conf->start_pending; conf->start_ready = conf->start_pending; wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; conf->start_future = mddev->sb->size+1; conf->cnt_pending = conf->cnt_future; conf->cnt_future = 0; conf->phase = conf->phase ^1; wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; conf->phase = 0; conf->cnt_future = conf->cnt_done;; conf->cnt_done = 0; spin_unlock_irq(&conf->segment_lock); wake_up(&conf->wait_done); } static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) { int err = 0; int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; raid1_conf_t *conf = mddev->private; struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; mdp_super_t *sb = mddev->sb; mdp_disk_t *failed_desc, *spare_desc, *added_desc; mdk_rdev_t *spare_rdev, *failed_rdev; print_raid1_conf(conf); md_spin_lock_irq(&conf->device_lock); /* * find the disk ... 
*/ switch (state) { case DISKOP_SPARE_ACTIVE: /* * Find the failed disk within the RAID1 configuration ... * (this can only be in the first conf->working_disks part) */ for (i = 0; i < conf->raid_disks; i++) { tmp = conf->mirrors + i; if ((!tmp->operational && !tmp->spare) || !tmp->used_slot) { failed_disk = i; break; } } /* * When we activate a spare disk we _must_ have a disk in * the lower (active) part of the array to replace. */ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { MD_BUG(); err = 1; goto abort; } /* fall through */ case DISKOP_SPARE_WRITE: case DISKOP_SPARE_INACTIVE: /* * Find the spare disk ... (can only be in the 'high' * area of the array) */ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { tmp = conf->mirrors + i; if (tmp->spare && tmp->number == (*d)->number) { spare_disk = i; break; } } if (spare_disk == -1) { MD_BUG(); err = 1; goto abort; } break; case DISKOP_HOT_REMOVE_DISK: for (i = 0; i < MD_SB_DISKS; i++) { tmp = conf->mirrors + i; if (tmp->used_slot && (tmp->number == (*d)->number)) { if (tmp->operational) { err = -EBUSY; goto abort; } removed_disk = i; break; } } if (removed_disk == -1) { MD_BUG(); err = 1; goto abort; } break; case DISKOP_HOT_ADD_DISK: for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { tmp = conf->mirrors + i; if (!tmp->used_slot) { added_disk = i; break; } } if (added_disk == -1) { MD_BUG(); err = 1; goto abort; } break; } switch (state) { /* * Switch the spare disk to write-only mode: */ case DISKOP_SPARE_WRITE: sdisk = conf->mirrors + spare_disk; sdisk->operational = 1; sdisk->write_only = 1; break; /* * Deactivate a spare disk: */ case DISKOP_SPARE_INACTIVE: close_sync(conf); sdisk = conf->mirrors + spare_disk; sdisk->operational = 0; sdisk->write_only = 0; break; /* * Activate (mark read-write) the (now sync) spare disk, * which means we switch it's 'raid position' (->raid_disk) * with the failed disk. (only the first 'conf->nr_disks' * slots are used for 'real' disks and we must preserve this * property) */ case DISKOP_SPARE_ACTIVE: close_sync(conf); sdisk = conf->mirrors + spare_disk; fdisk = conf->mirrors + failed_disk; spare_desc = &sb->disks[sdisk->number]; failed_desc = &sb->disks[fdisk->number]; if (spare_desc != *d) { MD_BUG(); err = 1; goto abort; } if (spare_desc->raid_disk != sdisk->raid_disk) { MD_BUG(); err = 1; goto abort; } if (sdisk->raid_disk != spare_disk) { MD_BUG(); err = 1; goto abort; } if (failed_desc->raid_disk != fdisk->raid_disk) { MD_BUG(); err = 1; goto abort; } if (fdisk->raid_disk != failed_disk) { MD_BUG(); err = 1; goto abort; } /* * do the switch finally */ spare_rdev = find_rdev_nr(mddev, spare_desc->number); failed_rdev = find_rdev_nr(mddev, failed_desc->number); /* There must be a spare_rdev, but there may not be a * failed_rdev. That slot might be empty... */ spare_rdev->desc_nr = failed_desc->number; if (failed_rdev) failed_rdev->desc_nr = spare_desc->number; xchg_values(*spare_desc, *failed_desc); xchg_values(*fdisk, *sdisk); /* * (careful, 'failed' and 'spare' are switched from now on) * * we want to preserve linear numbering and we want to * give the proper raid_disk number to the now activated * disk. (this means we switch back these values) */ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); xchg_values(sdisk->raid_disk, fdisk->raid_disk); xchg_values(spare_desc->number, failed_desc->number); xchg_values(sdisk->number, fdisk->number); *d = failed_desc; if (sdisk->dev == MKDEV(0,0)) sdisk->used_slot = 0; /* * this really activates the spare. 
 */
		fdisk->spare = 0;
		fdisk->write_only = 0;
		/*
		 * if we activate a spare, we definitely replace a
		 * non-operational disk slot in the 'low' area of
		 * the disk array.
		 */
		conf->working_disks++;

		break;

	case DISKOP_HOT_REMOVE_DISK:
		rdisk = conf->mirrors + removed_disk;

		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		rdisk->dev = MKDEV(0,0);
		rdisk->used_slot = 0;
		conf->nr_disks--;
		break;

	case DISKOP_HOT_ADD_DISK:
		adisk = conf->mirrors + added_disk;
		added_desc = *d;

		if (added_disk != added_desc->number) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		adisk->number = added_desc->number;
		adisk->raid_disk = added_desc->raid_disk;
		adisk->dev = MKDEV(added_desc->major, added_desc->minor);

		adisk->operational = 0;
		adisk->write_only = 0;
		adisk->spare = 1;
		adisk->used_slot = 1;
		adisk->head_position = 0;
		conf->nr_disks++;

		break;

	default:
		MD_BUG();
		err = 1;
		goto abort;
	}
abort:
	md_spin_unlock_irq(&conf->device_lock);
	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
		/* should move to "END_REBUILD" when such exists */
		raid1_shrink_buffers(conf);

	print_raid1_conf(conf);
	return err;
}


#define IO_ERROR KERN_ALERT \
"raid1: %s: unrecoverable I/O read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"raid1: %s: redirecting sector %lu to another mirror\n"

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
static void end_sync_write(struct buffer_head *bh, int uptodate);
static void end_sync_read(struct buffer_head *bh, int uptodate);

static void raid1d (void *data)
{
	struct raid1_bh *r1_bh;
	struct buffer_head *bh;
	unsigned long flags;
	mddev_t *mddev;
	kdev_t dev;

	for (;;) {
		md_spin_lock_irqsave(&retry_list_lock, flags);
		r1_bh = raid1_retry_list;
		if (!r1_bh)
			break;
		raid1_retry_list = r1_bh->next_r1;
		md_spin_unlock_irqrestore(&retry_list_lock, flags);

		mddev = r1_bh->mddev;
		if (mddev->sb_dirty) {
			printk(KERN_INFO "raid1: dirty sb detected, updating.\n");
			mddev->sb_dirty = 0;
			md_update_sb(mddev);
		}
		bh = &r1_bh->bh_req;
		switch(r1_bh->cmd) {
		case SPECIAL:
			/* have to allocate lots of bh structures and
			 * schedule writes
			 */
			if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
				int i, sum_bhs = 0;
				int disks = MD_SB_DISKS;
				struct buffer_head *bhl, *mbh;
				raid1_conf_t *conf;

				conf = mddev_to_conf(mddev);
				bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
				for (i = 0; i < disks ; i++) {
					if (!conf->mirrors[i].operational)
						continue;
					if (i == conf->last_used)
						/* we read from here, no need to write */
						continue;
					if (i < conf->raid_disks
					    && !conf->resync_mirrors)
						/* don't need to write this,
						 * we are just rebuilding */
						continue;
					mbh = bhl;
					if (!mbh) {
						MD_BUG();
						break;
					}
					bhl = mbh->b_next;
					mbh->b_this_page = (struct buffer_head *)1;

					/*
					 * prepare mirrored bh (fields ordered for max mem throughput):
					 */
					mbh->b_blocknr    = bh->b_blocknr;
					mbh->b_dev        = conf->mirrors[i].dev;
					mbh->b_rdev       = conf->mirrors[i].dev;
					mbh->b_rsector    = bh->b_blocknr;
					mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
							    (1<<BH_Mapped) | (1<<BH_Lock);
					atomic_set(&mbh->b_count, 1);
					mbh->b_size       = bh->b_size;
					mbh->b_page       = bh->b_page;
					mbh->b_data       = bh->b_data;
					mbh->b_list       = BUF_LOCKED;
					mbh->b_end_io     = end_sync_write;
					mbh->b_private    = r1_bh;

					mbh->b_next = r1_bh->mirror_bh_list;
					r1_bh->mirror_bh_list = mbh;

					sum_bhs++;
				}
				md_atomic_set(&r1_bh->remaining, sum_bhs);
				if (bhl) raid1_free_bh(conf, bhl);
				mbh = r1_bh->mirror_bh_list;

				if (!sum_bhs) {
					/* nowhere to write this to...
I guess we * must be done */ sync_request_done(bh->b_blocknr, conf); md_done_sync(mddev, bh->b_size>>9, 0); raid1_free_buf(r1_bh); } else while (mbh) { struct buffer_head *bh1 = mbh; mbh = mbh->b_next; generic_make_request(WRITE, bh1); md_sync_acct(bh1->b_dev, bh1->b_size/512); } } else { /* There is no point trying a read-for-reconstruct * as reconstruct is about to be aborted */ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); md_done_sync(mddev, bh->b_size>>9, 0); } break; case READ: case READA: dev = bh->b_dev; raid1_map (mddev, &bh->b_dev); if (bh->b_dev == dev) { printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); raid1_end_bh_io(r1_bh, 0); } else { printk (REDIRECT_SECTOR, partition_name(bh->b_dev), bh->b_blocknr); bh->b_rdev = bh->b_dev; bh->b_rsector = bh->b_blocknr; generic_make_request (r1_bh->cmd, bh); } break; } } md_spin_unlock_irqrestore(&retry_list_lock, flags); } #undef IO_ERROR #undef REDIRECT_SECTOR /* * Private kernel thread to reconstruct mirrors after an unclean * shutdown. */ static void raid1syncd (void *data) { raid1_conf_t *conf = data; mddev_t *mddev = conf->mddev; if (!conf->resync_mirrors) return; if (conf->resync_mirrors == 2) return; down(&mddev->recovery_sem); if (!md_do_sync(mddev, NULL)) { /* * Only if everything went Ok. */ conf->resync_mirrors = 0; } close_sync(conf); up(&mddev->recovery_sem); raid1_shrink_buffers(conf); } /* * perform a "sync" on one "block" * * We need to make sure that no normal I/O request - particularly write * requests - conflict with active sync requests. * This is achieved by conceptually dividing the device space into a * number of sections: * DONE: 0 .. a-1 These blocks are in-sync * ACTIVE: a.. b-1 These blocks may have active sync requests, but * no normal IO requests * READY: b .. c-1 These blocks have no normal IO requests - sync * request may be happening * PENDING: c .. d-1 These blocks may have IO requests, but no new * ones will be added * FUTURE: d .. end These blocks are not to be considered yet. IO may * be happening, but not sync * * We keep a * phase which flips (0 or 1) each time d moves and * a count of: * z = active io requests in FUTURE since d moved - marked with * current phase * y = active io requests in FUTURE before d moved, or PENDING - * marked with previous phase * x = active sync requests in READY * w = active sync requests in ACTIVE * v = active io requests in DONE * * Normally, a=b=c=d=0 and z= active io requests * or a=b=c=d=END and v= active io requests * Allowed changes to a,b,c,d: * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase * B: y==0 -> c=d * C: b=c, w+=x, x=0 * D: w==0 -> a=b * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 * * At start of sync we apply A. * When y reaches 0, we apply B then A then being sync requests * When sync point reaches c-1, we wait for y==0, and W==0, and * then apply apply B then A then D then C. 
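 *
 * (A worked example of the steady-state advance, using the numbers set up
 * in raid1_sync_request() below: with 4K pages the code asks for
 * 128*2/8 = 32 buffer pages and, if it gets them all, window = 32*8/2 = 128
 * sectors.  On the very first call the advance loop runs twice, leaving
 * start_active = start_ready = 0, start_pending = 128, start_future = 256,
 * so sector 0 lands in the READY region and is counted in cnt_ready.
 * Each later pass waits for cnt_active and cnt_pending to drain, shifts
 * start_active <- start_ready <- start_pending <- start_future, adds the
 * window to start_future and flips the phase -- i.e. roughly C+D followed
 * by B+A in the rules above.)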
 * Finally, we apply E
 *
 * The sync request simply issues a "read" against a working drive
 * This is marked so that on completion the raid1d thread is woken to
 * issue suitable write requests
 */

static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	struct mirror_info *mirror;
	struct raid1_bh *r1_bh;
	struct buffer_head *bh;
	int bsize;
	int disk;
	int block_nr;

	spin_lock_irq(&conf->segment_lock);
	if (!sector_nr) {
		/* initialize ...*/
		int buffs;
		conf->start_active = 0;
		conf->start_ready = 0;
		conf->start_pending = 0;
		conf->start_future = 0;
		conf->phase = 0;
		/* we want enough buffers to hold twice the window of 128 */
		buffs = 128*2 / (PAGE_SIZE>>9);
		buffs = raid1_grow_buffers(conf, buffs);
		if (buffs < 2)
			goto nomem;

		conf->window = buffs*(PAGE_SIZE>>9)/2;
		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
		conf->cnt_done = conf->cnt_pending = 0;
		if (conf->cnt_ready || conf->cnt_active)
			MD_BUG();
	}
	while (sector_nr >= conf->start_pending) {
		PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
			sector_nr, conf->start_active, conf->start_ready,
			conf->start_pending, conf->start_future,
			conf->cnt_done, conf->cnt_active, conf->cnt_ready,
			conf->cnt_pending, conf->cnt_future);
		wait_event_lock_irq(conf->wait_done,
					!conf->cnt_active,
					conf->segment_lock);
		wait_event_lock_irq(conf->wait_ready,
					!conf->cnt_pending,
					conf->segment_lock);
		conf->start_active = conf->start_ready;
		conf->start_ready = conf->start_pending;
		conf->start_pending = conf->start_future;
		conf->start_future = conf->start_future+conf->window;
		// Note: falling off the end is not a problem
		conf->phase = conf->phase ^1;
		conf->cnt_active = conf->cnt_ready;
		conf->cnt_ready = 0;
		conf->cnt_pending = conf->cnt_future;
		conf->cnt_future = 0;
		wake_up(&conf->wait_done);
	}
	conf->cnt_ready++;
	spin_unlock_irq(&conf->segment_lock);


	/* If reconstructing, and >1 working disc,
	 * could dedicate one to rebuild and others to
	 * service read requests ..
	 */
	disk = conf->last_used;
	/* make sure disk is operational */
	while (!conf->mirrors[disk].operational) {
		if (disk <= 0) disk = conf->raid_disks;
		disk--;
		if (disk == conf->last_used)
			break;
	}
	conf->last_used = disk;

	mirror = conf->mirrors+conf->last_used;

	r1_bh = raid1_alloc_buf (conf);
	r1_bh->master_bh = NULL;
	r1_bh->mddev = mddev;
	r1_bh->cmd = SPECIAL;
	bh = &r1_bh->bh_req;

	block_nr = sector_nr;
	bsize = 512;
	while (!(block_nr & 1) && bsize < PAGE_SIZE
			&& (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
		block_nr >>= 1;
		bsize <<= 1;
	}
	bh->b_size = bsize;
	bh->b_list = BUF_LOCKED;
	bh->b_dev = mirror->dev;
	bh->b_rdev = mirror->dev;
	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
	if (!bh->b_page)
		BUG();
	if (!bh->b_data)
		BUG();
	if (bh->b_data != page_address(bh->b_page))
		BUG();
	bh->b_end_io = end_sync_read;
	bh->b_private = r1_bh;
	bh->b_blocknr = sector_nr;
	bh->b_rsector = sector_nr;
	init_waitqueue_head(&bh->b_wait);

	generic_make_request(READ, bh);
	md_sync_acct(bh->b_dev, bh->b_size/512);

	return (bsize >> 9);

nomem:
	raid1_shrink_buffers(conf);
	spin_unlock_irq(&conf->segment_lock);
	return -ENOMEM;
}

static void end_sync_read(struct buffer_head *bh, int uptodate)
{
	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);

	/* we have read a block, now it needs to be re-written,
	 * or re-read if the read failed.
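	 *
	 * (The full resync data path through this file: raid1_sync_request()
	 * issues the READ with b_end_io = end_sync_read; raid1_reschedule_retry()
	 * below hands the r1_bh to raid1d, which sees cmd == SPECIAL and issues
	 * the mirrored WRITEs with end_sync_write(); once all of those complete,
	 * end_sync_write() calls sync_request_done() and md_done_sync().)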
* We don't do much here, just schedule handling by raid1d */ if (!uptodate) md_error (r1_bh->mddev, bh->b_dev); else set_bit(R1BH_Uptodate, &r1_bh->state); raid1_reschedule_retry(r1_bh); } static void end_sync_write(struct buffer_head *bh, int uptodate) { struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); if (!uptodate) md_error (r1_bh->mddev, bh->b_dev); if (atomic_dec_and_test(&r1_bh->remaining)) { mddev_t *mddev = r1_bh->mddev; unsigned long sect = bh->b_blocknr; int size = bh->b_size; raid1_free_buf(r1_bh); sync_request_done(sect, mddev_to_conf(mddev)); md_done_sync(mddev,size>>9, uptodate); } } #define INVALID_LEVEL KERN_WARNING \ "raid1: md%d: raid level not set to mirroring (%d)\n" #define NO_SB KERN_ERR \ "raid1: disabled mirror %s (couldn't access raid superblock)\n" #define ERRORS KERN_ERR \ "raid1: disabled mirror %s (errors detected)\n" #define NOT_IN_SYNC KERN_ERR \ "raid1: disabled mirror %s (not in sync)\n" #define INCONSISTENT KERN_ERR \ "raid1: disabled mirror %s (inconsistent descriptor)\n" #define ALREADY_RUNNING KERN_ERR \ "raid1: disabled mirror %s (mirror %d already operational)\n" #define OPERATIONAL KERN_INFO \ "raid1: device %s operational as mirror %d\n" #define MEM_ERROR KERN_ERR \ "raid1: couldn't allocate memory for md%d\n" #define SPARE KERN_INFO \ "raid1: spare disk %s\n" #define NONE_OPERATIONAL KERN_ERR \ "raid1: no operational mirrors for md%d\n" #define ARRAY_IS_ACTIVE KERN_INFO \ "raid1: raid set md%d active with %d out of %d mirrors\n" #define THREAD_ERROR KERN_ERR \ "raid1: couldn't allocate thread for md%d\n" #define START_RESYNC KERN_WARNING \ "raid1: raid set md%d not clean; reconstructing mirrors\n" static int raid1_run (mddev_t *mddev) { raid1_conf_t *conf; int i, j, disk_idx; struct mirror_info *disk; mdp_super_t *sb = mddev->sb; mdp_disk_t *descriptor; mdk_rdev_t *rdev; struct md_list_head *tmp; int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 1) { printk(INVALID_LEVEL, mdidx(mddev), sb->level); goto out; } /* * copy the already verified devices into our private RAID1 * bookkeeping area. 
[whatever we allocate in raid1_run(), * should be freed in raid1_stop()] */ conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); mddev->private = conf; if (!conf) { printk(MEM_ERROR, mdidx(mddev)); goto out; } memset(conf, 0, sizeof(*conf)); ITERATE_RDEV(mddev,rdev,tmp) { if (rdev->faulty) { printk(ERRORS, partition_name(rdev->dev)); } else { if (!rdev->sb) { MD_BUG(); continue; } } if (rdev->desc_nr == -1) { MD_BUG(); continue; } descriptor = &sb->disks[rdev->desc_nr]; disk_idx = descriptor->raid_disk; disk = conf->mirrors + disk_idx; if (disk_faulty(descriptor)) { disk->number = descriptor->number; disk->raid_disk = disk_idx; disk->dev = rdev->dev; disk->sect_limit = MAX_WORK_PER_DISK; disk->operational = 0; disk->write_only = 0; disk->spare = 0; disk->used_slot = 1; disk->head_position = 0; continue; } if (disk_active(descriptor)) { if (!disk_sync(descriptor)) { printk(NOT_IN_SYNC, partition_name(rdev->dev)); continue; } if ((descriptor->number > MD_SB_DISKS) || (disk_idx > sb->raid_disks)) { printk(INCONSISTENT, partition_name(rdev->dev)); continue; } if (disk->operational) { printk(ALREADY_RUNNING, partition_name(rdev->dev), disk_idx); continue; } printk(OPERATIONAL, partition_name(rdev->dev), disk_idx); disk->number = descriptor->number; disk->raid_disk = disk_idx; disk->dev = rdev->dev; disk->sect_limit = MAX_WORK_PER_DISK; disk->operational = 1; disk->write_only = 0; disk->spare = 0; disk->used_slot = 1; disk->head_position = 0; conf->working_disks++; } else { /* * Must be a spare disk .. */ printk(SPARE, partition_name(rdev->dev)); disk->number = descriptor->number; disk->raid_disk = disk_idx; disk->dev = rdev->dev; disk->sect_limit = MAX_WORK_PER_DISK; disk->operational = 0; disk->write_only = 0; disk->spare = 1; disk->used_slot = 1; disk->head_position = 0; } } conf->raid_disks = sb->raid_disks; conf->nr_disks = sb->nr_disks; conf->mddev = mddev; conf->device_lock = MD_SPIN_LOCK_UNLOCKED; conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; init_waitqueue_head(&conf->wait_buffer); init_waitqueue_head(&conf->wait_done); init_waitqueue_head(&conf->wait_ready); if (!conf->working_disks) { printk(NONE_OPERATIONAL, mdidx(mddev)); goto out_free_conf; } /* pre-allocate some buffer_head structures. * As a minimum, 1 r1bh and raid_disks buffer_heads * would probably get us by in tight memory situations, * but a few more is probably a good idea. * For now, try NR_RESERVED_BUFS r1bh and * NR_RESERVED_BUFS*raid_disks bufferheads * This will allow at least NR_RESERVED_BUFS concurrent * reads or writes even if kmalloc starts failing */ if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS || raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks) < NR_RESERVED_BUFS*conf->raid_disks) { printk(MEM_ERROR, mdidx(mddev)); goto out_free_conf; } for (i = 0; i < MD_SB_DISKS; i++) { descriptor = sb->disks+i; disk_idx = descriptor->raid_disk; disk = conf->mirrors + disk_idx; if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && !disk->used_slot) { disk->number = descriptor->number; disk->raid_disk = disk_idx; disk->dev = MKDEV(0,0); disk->operational = 0; disk->write_only = 0; disk->spare = 0; disk->used_slot = 1; disk->head_position = 0; } } /* * find the first working one and use it as a starting point * to read balancing. 
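	 * (the loop below tests ->operational before the bounds check, so it
	 * relies on the !conf->working_disks test above having guaranteed that
	 * at least one operational mirror exists.)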
*/ for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) /* nothing */; conf->last_used = j; if (conf->working_disks != sb->raid_disks) { printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); start_recovery = 1; } { const char * name = "raid1d"; conf->thread = md_register_thread(raid1d, conf, name); if (!conf->thread) { printk(THREAD_ERROR, mdidx(mddev)); goto out_free_conf; } } if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) && (conf->working_disks > 1)) { const char * name = "raid1syncd"; conf->resync_thread = md_register_thread(raid1syncd, conf,name); if (!conf->resync_thread) { printk(THREAD_ERROR, mdidx(mddev)); goto out_free_conf; } printk(START_RESYNC, mdidx(mddev)); conf->resync_mirrors = 1; md_wakeup_thread(conf->resync_thread); } /* * Regenerate the "device is in sync with the raid set" bit for * each device. */ for (i = 0; i < MD_SB_DISKS; i++) { mark_disk_nonsync(sb->disks+i); for (j = 0; j < sb->raid_disks; j++) { if (!conf->mirrors[j].operational) continue; if (sb->disks[i].number == conf->mirrors[j].number) mark_disk_sync(sb->disks+i); } } sb->active_disks = conf->working_disks; if (start_recovery) md_recover_arrays(); printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); /* * Ok, everything is just fine now */ return 0; out_free_conf: raid1_shrink_r1bh(conf); raid1_shrink_bh(conf); raid1_shrink_buffers(conf); kfree(conf); mddev->private = NULL; out: MOD_DEC_USE_COUNT; return -EIO; } #undef INVALID_LEVEL #undef NO_SB #undef ERRORS #undef NOT_IN_SYNC #undef INCONSISTENT #undef ALREADY_RUNNING #undef OPERATIONAL #undef SPARE #undef NONE_OPERATIONAL #undef ARRAY_IS_ACTIVE static int raid1_stop_resync (mddev_t *mddev) { raid1_conf_t *conf = mddev_to_conf(mddev); if (conf->resync_thread) { if (conf->resync_mirrors) { conf->resync_mirrors = 2; md_interrupt_thread(conf->resync_thread); printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); return 1; } return 0; } return 0; } static int raid1_restart_resync (mddev_t *mddev) { raid1_conf_t *conf = mddev_to_conf(mddev); if (conf->resync_mirrors) { if (!conf->resync_thread) { MD_BUG(); return 0; } conf->resync_mirrors = 1; md_wakeup_thread(conf->resync_thread); return 1; } return 0; } static int raid1_stop (mddev_t *mddev) { raid1_conf_t *conf = mddev_to_conf(mddev); md_unregister_thread(conf->thread); if (conf->resync_thread) md_unregister_thread(conf->resync_thread); raid1_shrink_r1bh(conf); raid1_shrink_bh(conf); raid1_shrink_buffers(conf); kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } static mdk_personality_t raid1_personality= { name: "raid1", make_request: raid1_make_request, run: raid1_run, stop: raid1_stop, status: raid1_status, error_handler: raid1_error, diskop: raid1_diskop, stop_resync: raid1_stop_resync, restart_resync: raid1_restart_resync, sync_request: raid1_sync_request }; static int md__init raid1_init (void) { return register_md_personality (RAID1, &raid1_personality); } static void raid1_exit (void) { unregister_md_personality (RAID1); } module_init(raid1_init); module_exit(raid1_exit); MODULE_LICENSE("GPL");