Lines Matching +full:suspend +full:- +full:to +full:- +full:disk
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
8 #include "dm-core.h"
9 #include "dm-rq.h"
10 #include "dm-uevent.h"
30 #include <linux/blk-crypto.h>
66 * One of these is allocated (on-stack) per original bio.
111 if (!tio->inside_dm_io) in dm_per_bio_data()
112 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size; in dm_per_bio_data()
113 …return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_siz… in dm_per_bio_data()
120 if (io->magic == DM_IO_MAGIC) in dm_bio_from_per_bio_data()
122 BUG_ON(io->magic != DM_TIO_MAGIC); in dm_bio_from_per_bio_data()
129 return container_of(bio, struct dm_target_io, clone)->target_bio_nr; in dm_bio_get_target_bio_nr()
133 #define MINOR_ALLOCED ((void *)-1)
136 * Bits for the md->flags field.
152 * For mempools pre-allocation at the table loading time.
166 * Bio-based DM's mempools' reserved IOs set by the user.
221 DM_NUMA_NODE, num_online_nodes() - 1); in dm_get_numa_node()
234 r = -ENOMEM; in local_init()
306 while (i--) in dm_init()
316 while (i--) in dm_exit()
330 return test_bit(DMF_DELETING, &md->flags); in dm_deleting_md()
339 md = bdev->bd_disk->private_data; in dm_blk_open()
343 if (test_bit(DMF_FREEING, &md->flags) || in dm_blk_open()
350 atomic_inc(&md->open_count); in dm_blk_open()
354 return md ? 0 : -ENXIO; in dm_blk_open()
357 static void dm_blk_close(struct gendisk *disk, fmode_t mode) in dm_blk_close() argument
363 md = disk->private_data; in dm_blk_close()
367 if (atomic_dec_and_test(&md->open_count) && in dm_blk_close()
368 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) in dm_blk_close()
378 return atomic_read(&md->open_count); in dm_open_count()
391 r = -EBUSY; in dm_lock_for_deletion()
393 set_bit(DMF_DEFERRED_REMOVE, &md->flags); in dm_lock_for_deletion()
394 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) in dm_lock_for_deletion()
395 r = -EEXIST; in dm_lock_for_deletion()
397 set_bit(DMF_DELETING, &md->flags); in dm_lock_for_deletion()
410 if (test_bit(DMF_DELETING, &md->flags)) in dm_cancel_deferred_remove()
411 r = -EBUSY; in dm_cancel_deferred_remove()
413 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); in dm_cancel_deferred_remove()
427 struct mapped_device *md = bdev->bd_disk->private_data; in dm_blk_getgeo()
436 sector_t sector_diff = args->tgt->begin - args->start; in dm_report_zones_cb()
441 if (zone->start >= args->start + args->tgt->len) in dm_report_zones_cb()
446 * to match its position in the target range. in dm_report_zones_cb()
448 zone->start += sector_diff; in dm_report_zones_cb()
449 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { in dm_report_zones_cb()
450 if (zone->cond == BLK_ZONE_COND_FULL) in dm_report_zones_cb()
451 zone->wp = zone->start + zone->len; in dm_report_zones_cb()
452 else if (zone->cond == BLK_ZONE_COND_EMPTY) in dm_report_zones_cb()
453 zone->wp = zone->start; in dm_report_zones_cb()
455 zone->wp += sector_diff; in dm_report_zones_cb()
458 args->next_sector = zone->start + zone->len; in dm_report_zones_cb()
459 return args->orig_cb(zone, args->zone_idx++, args->orig_data); in dm_report_zones_cb()
463 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, in dm_blk_report_zones() argument
466 struct mapped_device *md = disk->private_data; in dm_blk_report_zones()
476 return -EAGAIN; in dm_blk_report_zones()
480 ret = -EIO; in dm_blk_report_zones()
488 if (WARN_ON_ONCE(!tgt->type->report_zones)) { in dm_blk_report_zones()
489 ret = -EIO; in dm_blk_report_zones()
494 ret = tgt->type->report_zones(tgt, &args, in dm_blk_report_zones()
495 nr_zones - args.zone_idx); in dm_blk_report_zones()
499 args.next_sector < get_capacity(disk)); in dm_blk_report_zones()
518 r = -ENOTTY; in dm_prepare_ioctl()
528 if (!tgt->type->prepare_ioctl) in dm_prepare_ioctl()
532 return -EAGAIN; in dm_prepare_ioctl()
534 r = tgt->type->prepare_ioctl(tgt, bdev); in dm_prepare_ioctl()
535 if (r == -ENOTCONN && !fatal_signal_pending(current)) { in dm_prepare_ioctl()
552 struct mapped_device *md = bdev->bd_disk->private_data; in dm_blk_ioctl()
566 "%s: sending ioctl %x to DM device without required privilege.", in dm_blk_ioctl()
567 current->comm, cmd); in dm_blk_ioctl()
568 r = -ENOIOCTLCMD; in dm_blk_ioctl()
582 struct dm_io *io = tio->io; in dm_start_time_ns_from_clone()
584 return jiffies_to_nsecs(io->start_time); in dm_start_time_ns_from_clone()
590 struct mapped_device *md = io->md; in start_io_acct()
591 struct bio *bio = io->orig_bio; in start_io_acct()
593 io->start_time = bio_start_io_acct(bio); in start_io_acct()
594 if (unlikely(dm_stats_used(&md->stats))) in start_io_acct()
595 dm_stats_account_io(&md->stats, bio_data_dir(bio), in start_io_acct()
596 bio->bi_iter.bi_sector, bio_sectors(bio), in start_io_acct()
597 false, 0, &io->stats_aux); in start_io_acct()
602 struct mapped_device *md = io->md; in end_io_acct()
603 struct bio *bio = io->orig_bio; in end_io_acct()
604 unsigned long duration = jiffies - io->start_time; in end_io_acct()
606 bio_end_io_acct(bio, io->start_time); in end_io_acct()
608 if (unlikely(dm_stats_used(&md->stats))) in end_io_acct()
609 dm_stats_account_io(&md->stats, bio_data_dir(bio), in end_io_acct()
610 bio->bi_iter.bi_sector, bio_sectors(bio), in end_io_acct()
611 true, duration, &io->stats_aux); in end_io_acct()
613 /* nudge anyone waiting on suspend queue */ in end_io_acct()
614 if (unlikely(wq_has_sleeper(&md->wait))) in end_io_acct()
615 wake_up(&md->wait); in end_io_acct()
624 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs); in alloc_io()
629 tio->inside_dm_io = true; in alloc_io()
630 tio->io = NULL; in alloc_io()
633 io->magic = DM_IO_MAGIC; in alloc_io()
634 io->status = 0; in alloc_io()
635 atomic_set(&io->io_count, 1); in alloc_io()
636 io->orig_bio = bio; in alloc_io()
637 io->md = md; in alloc_io()
638 spin_lock_init(&io->endio_lock); in alloc_io()
647 bio_put(&io->tio.clone); in free_io()
655 if (!ci->io->tio.io) { in alloc_tio()
656 /* the dm_target_io embedded in ci->io is available */ in alloc_tio()
657 tio = &ci->io->tio; in alloc_tio()
659 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs); in alloc_tio()
664 tio->inside_dm_io = false; in alloc_tio()
667 tio->magic = DM_TIO_MAGIC; in alloc_tio()
668 tio->io = ci->io; in alloc_tio()
669 tio->ti = ti; in alloc_tio()
670 tio->target_bio_nr = target_bio_nr; in alloc_tio()
677 if (tio->inside_dm_io) in free_tio()
679 bio_put(&tio->clone); in free_tio()
683 * Add the bio to the list of deferred io.
689 spin_lock_irqsave(&md->deferred_lock, flags); in queue_io()
690 bio_list_add(&md->deferred, bio); in queue_io()
691 spin_unlock_irqrestore(&md->deferred_lock, flags); in queue_io()
692 queue_work(md->wq, &md->work); in queue_io()
697 * function to access the md->map field, and make sure they call
700 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barri… in dm_get_live_table()
702 *srcu_idx = srcu_read_lock(&md->io_barrier); in dm_get_live_table()
704 return srcu_dereference(md->map, &md->io_barrier); in dm_get_live_table()
707 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) in dm_put_live_table()
709 srcu_read_unlock(&md->io_barrier, srcu_idx); in dm_put_live_table()
714 synchronize_srcu(&md->io_barrier); in dm_sync_table()
719 * A fast alternative to dm_get_live_table/dm_put_live_table.
725 return rcu_dereference(md->map); in dm_get_live_table_fast()
733 static char *_dm_claim_ptr = "I belong to device-mapper";
745 BUG_ON(td->dm_dev.bdev); in open_table_device()
747 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); in open_table_device()
753 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); in open_table_device()
757 td->dm_dev.bdev = bdev; in open_table_device()
758 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); in open_table_device()
767 if (!td->dm_dev.bdev) in close_table_device()
770 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); in close_table_device()
771 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); in close_table_device()
772 put_dax(td->dm_dev.dax_dev); in close_table_device()
773 td->dm_dev.bdev = NULL; in close_table_device()
774 td->dm_dev.dax_dev = NULL; in close_table_device()
783 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) in find_table_device()
795 mutex_lock(&md->table_devices_lock); in dm_get_table_device()
796 td = find_table_device(&md->table_devices, dev, mode); in dm_get_table_device()
798 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); in dm_get_table_device()
800 mutex_unlock(&md->table_devices_lock); in dm_get_table_device()
801 return -ENOMEM; in dm_get_table_device()
804 td->dm_dev.mode = mode; in dm_get_table_device()
805 td->dm_dev.bdev = NULL; in dm_get_table_device()
808 mutex_unlock(&md->table_devices_lock); in dm_get_table_device()
813 format_dev_t(td->dm_dev.name, dev); in dm_get_table_device()
815 refcount_set(&td->count, 1); in dm_get_table_device()
816 list_add(&td->list, &md->table_devices); in dm_get_table_device()
818 refcount_inc(&td->count); in dm_get_table_device()
820 mutex_unlock(&md->table_devices_lock); in dm_get_table_device()
822 *result = &td->dm_dev; in dm_get_table_device()
831 mutex_lock(&md->table_devices_lock); in dm_put_table_device()
832 if (refcount_dec_and_test(&td->count)) { in dm_put_table_device()
834 list_del(&td->list); in dm_put_table_device()
837 mutex_unlock(&md->table_devices_lock); in dm_put_table_device()
849 td->dm_dev.name, refcount_read(&td->count)); in free_table_devices()
859 *geo = md->geometry; in dm_get_geometry()
869 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; in dm_set_geometry()
871 if (geo->start > sz) { in dm_set_geometry()
873 return -EINVAL; in dm_set_geometry()
876 md->geometry = *geo; in dm_set_geometry()
883 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); in __noflush_suspending()
895 struct mapped_device *md = io->md; in dec_pending()
897 /* Push-back supersedes any I/O errors */ in dec_pending()
899 spin_lock_irqsave(&io->endio_lock, flags); in dec_pending()
900 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md))) in dec_pending()
901 io->status = error; in dec_pending()
902 spin_unlock_irqrestore(&io->endio_lock, flags); in dec_pending()
905 if (atomic_dec_and_test(&io->io_count)) { in dec_pending()
906 if (io->status == BLK_STS_DM_REQUEUE) { in dec_pending()
910 spin_lock_irqsave(&md->deferred_lock, flags); in dec_pending()
912 /* NOTE early return due to BLK_STS_DM_REQUEUE below */ in dec_pending()
913 bio_list_add_head(&md->deferred, io->orig_bio); in dec_pending()
915 /* noflush suspend was interrupted. */ in dec_pending()
916 io->status = BLK_STS_IOERR; in dec_pending()
917 spin_unlock_irqrestore(&md->deferred_lock, flags); in dec_pending()
920 io_error = io->status; in dec_pending()
921 bio = io->orig_bio; in dec_pending()
928 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { in dec_pending()
933 bio->bi_opf &= ~REQ_PREFLUSH; in dec_pending()
938 bio->bi_status = io_error; in dec_pending()
949 limits->max_discard_sectors = 0; in disable_discard()
950 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); in disable_discard()
958 limits->max_write_same_sectors = 0; in disable_write_same()
966 limits->max_write_zeroes_sectors = 0; in disable_write_zeroes()
971 blk_status_t error = bio->bi_status; in clone_endio()
973 struct dm_io *io = tio->io; in clone_endio()
974 struct mapped_device *md = tio->io->md; in clone_endio()
975 dm_endio_fn endio = tio->ti->type->end_io; in clone_endio()
976 struct bio *orig_bio = io->orig_bio; in clone_endio()
980 !bio->bi_disk->queue->limits.max_discard_sectors) in clone_endio()
983 !bio->bi_disk->queue->limits.max_write_same_sectors) in clone_endio()
986 !bio->bi_disk->queue->limits.max_write_zeroes_sectors) in clone_endio()
991 * For zone-append bios get offset in zone of the written in clone_endio()
992 * sector and add that to the original bio sector pos. in clone_endio()
995 sector_t written_sector = bio->bi_iter.bi_sector; in clone_endio()
996 struct request_queue *q = orig_bio->bi_disk->queue; in clone_endio()
997 u64 mask = (u64)blk_queue_zone_sectors(q) - 1; in clone_endio()
999 orig_bio->bi_iter.bi_sector += written_sector & mask; in clone_endio()
1003 int r = endio(tio->ti, bio, &error); in clone_endio()
1024 * Return maximum size of I/O possible at the supplied sector up to the current
1030 return ti->len - target_offset; in max_io_len_target_boundary()
1040 * Does the target need to split IO even further? in max_io_len()
1041 * - varied (per target) IO splitting is a tenet of DM; this in max_io_len()
1044 * ti->max_io_len to override stacked chunk_sectors. in max_io_len()
1046 if (ti->max_io_len) { in max_io_len()
1047 max_len = blk_max_size_offset(ti->table->md->queue, in max_io_len()
1048 target_offset, ti->max_io_len); in max_io_len()
1061 ti->error = "Maximum size of target IO is too large"; in dm_set_target_max_io_len()
1062 return -EINVAL; in dm_set_target_max_io_len()
1065 ti->max_io_len = (uint32_t) len; in dm_set_target_max_io_len()
1073 __acquires(md->io_barrier) in dm_dax_get_live_target()
1095 long len, ret = -EIO; in dm_dax_direct_access()
1102 if (!ti->type->direct_access) in dm_dax_direct_access()
1108 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); in dm_dax_direct_access()
1149 if (!ti->type->dax_copy_from_iter) { in dm_dax_copy_from_iter()
1153 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i); in dm_dax_copy_from_iter()
1173 if (!ti->type->dax_copy_to_iter) { in dm_dax_copy_to_iter()
1177 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i); in dm_dax_copy_to_iter()
1190 int ret = -EIO; in dm_dax_zero_page_range()
1197 if (WARN_ON(!ti->type->dax_zero_page_range)) { in dm_dax_zero_page_range()
1199 * ->zero_page_range() is mandatory dax operation. If we are in dm_dax_zero_page_range()
1204 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); in dm_dax_zero_page_range()
1216 * dm_accept_partial_bio informs the dm that the target only wants to process
1221 * +--------------------+---------------+-------+
1223 * +--------------------+---------------+-------+
1225 * <-------------- *tio->len_ptr --------------->
1226 * <------- bi_size ------->
1227 * <-- n_sectors -->
1231 * Region 2 is the remaining bio size that the target wants to process.
1232 * (it may be empty if region 1 is non-empty, although there is no reason
1233 * to make it empty)
1234 * The target requires that region 3 is to be sent in the next bio.
1236 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1243 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; in dm_accept_partial_bio()
1244 BUG_ON(bio->bi_opf & REQ_PREFLUSH); in dm_accept_partial_bio()
1245 BUG_ON(bi_size > *tio->len_ptr); in dm_accept_partial_bio()
1247 *tio->len_ptr -= bi_size - n_sectors; in dm_accept_partial_bio()
1248 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; in dm_accept_partial_bio()
1256 struct bio *clone = &tio->clone; in __map_bio()
1257 struct dm_io *io = tio->io; in __map_bio()
1258 struct dm_target *ti = tio->ti; in __map_bio()
1261 clone->bi_end_io = clone_endio; in __map_bio()
1264 * Map the clone. If r == 0 we don't need to do in __map_bio()
1268 atomic_inc(&io->io_count); in __map_bio()
1269 sector = clone->bi_iter.bi_sector; in __map_bio()
1271 r = ti->type->map(ti, clone); in __map_bio()
1277 trace_block_bio_remap(clone->bi_disk->queue, clone, in __map_bio()
1278 bio_dev(io->orig_bio), sector); in __map_bio()
1299 bio->bi_iter.bi_sector = sector; in bio_setup_sector()
1300 bio->bi_iter.bi_size = to_bytes(len); in bio_setup_sector()
1309 struct bio *clone = &tio->clone; in clone_bio()
1319 if (unlikely(!dm_target_has_integrity(tio->ti->type) && in clone_bio()
1320 !dm_target_passes_integrity(tio->ti->type))) { in clone_bio()
1322 dm_device_name(tio->io->md), in clone_bio()
1323 tio->ti->type->name); in clone_bio()
1324 return -EIO; in clone_bio()
1332 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); in clone_bio()
1333 clone->bi_iter.bi_size = to_bytes(len); in clone_bio()
1352 bio_list_add(blist, &tio->clone); in alloc_multiple_bios()
1361 mutex_lock(&ci->io->md->table_devices_lock); in alloc_multiple_bios()
1367 bio_list_add(blist, &tio->clone); in alloc_multiple_bios()
1370 mutex_unlock(&ci->io->md->table_devices_lock); in alloc_multiple_bios()
1384 struct bio *clone = &tio->clone; in __clone_and_map_simple_bio()
1386 tio->len_ptr = len; in __clone_and_map_simple_bio()
1388 __bio_clone_fast(clone, ci->bio); in __clone_and_map_simple_bio()
1390 bio_setup_sector(clone, ci->sector, *len); in __clone_and_map_simple_bio()
1417 * Use an on-stack bio for this, it's safe since we don't in __send_empty_flush()
1418 * need to reference it after submit. It's just used as in __send_empty_flush()
1423 ci->bio = &flush_bio; in __send_empty_flush()
1424 ci->sector_count = 0; in __send_empty_flush()
1433 bio_set_dev(ci->bio, ci->io->md->bdev); in __send_empty_flush()
1435 BUG_ON(bio_has_data(ci->bio)); in __send_empty_flush()
1436 while ((ti = dm_table_get_target(ci->map, target_nr++))) in __send_empty_flush()
1437 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); in __send_empty_flush()
1439 bio_uninit(ci->bio); in __send_empty_flush()
1446 struct bio *bio = ci->bio; in __clone_and_map_data_bio()
1451 tio->len_ptr = len; in __clone_and_map_data_bio()
1474 return -EOPNOTSUPP; in __send_changing_extent_only()
1476 len = min_t(sector_t, ci->sector_count, in __send_changing_extent_only()
1477 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); in __send_changing_extent_only()
1481 ci->sector += len; in __send_changing_extent_only()
1482 ci->sector_count -= len; in __send_changing_extent_only()
1506 struct bio *bio = ci->bio; in __process_abnormal_io()
1511 num_bios = ti->num_discard_bios; in __process_abnormal_io()
1514 num_bios = ti->num_secure_erase_bios; in __process_abnormal_io()
1517 num_bios = ti->num_write_same_bios; in __process_abnormal_io()
1520 num_bios = ti->num_write_zeroes_bios; in __process_abnormal_io()
1531 * Select the correct strategy for processing a non-flush bio.
1539 ti = dm_table_find_target(ci->map, ci->sector); in __split_and_process_non_flush()
1541 return -EIO; in __split_and_process_non_flush()
1546 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); in __split_and_process_non_flush()
1548 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); in __split_and_process_non_flush()
1552 ci->sector += len; in __split_and_process_non_flush()
1553 ci->sector_count -= len; in __split_and_process_non_flush()
1561 ci->map = map; in init_clone_info()
1562 ci->io = alloc_io(md, bio); in init_clone_info()
1563 ci->sector = bio->bi_iter.bi_sector; in init_clone_info()
1567 (part_stat_get(part, field) -= (subnd))
1570 * Entry point to split a bio into clones and submit them to the targets.
1581 if (bio->bi_opf & REQ_PREFLUSH) { in __split_and_process_bio()
1593 if (current->bio_list && ci.sector_count && !error) { in __split_and_process_bio()
1595 * Remainder must be passed to submit_bio_noacct() in __split_and_process_bio()
1598 * We take a clone of the original to store in in __split_and_process_bio()
1599 * ci.io->orig_bio to be used by end_io_acct() and in __split_and_process_bio()
1600 * for dec_pending to use for completion handling. in __split_and_process_bio()
1602 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count, in __split_and_process_bio()
1603 GFP_NOIO, &md->queue->bio_split); in __split_and_process_bio()
1604 ci.io->orig_bio = b; in __split_and_process_bio()
1609 * NOTE: this is a stop-gap fix, a proper fix involves in __split_and_process_bio()
1614 __dm_part_stat_sub(&dm_disk(md)->part0, in __split_and_process_bio()
1619 trace_block_split(md->queue, b, bio->bi_iter.bi_sector); in __split_and_process_bio()
1633 struct mapped_device *md = bio->bi_disk->private_data; in dm_submit_bio()
1647 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { in dm_submit_bio()
1648 if (bio->bi_opf & REQ_NOWAIT) in dm_submit_bio()
1650 else if (bio->bi_opf & REQ_RAHEAD) in dm_submit_bio()
1670 /*-----------------------------------------------------------------
1671 * An IDR is used to keep track of allocated minor numbers.
1672 *---------------------------------------------------------------*/
1688 return -EINVAL; in specific_minor()
1698 return r == -ENOSPC ? -EBUSY : r; in specific_minor()
1727 if (md->wq) in cleanup_mapped_device()
1728 destroy_workqueue(md->wq); in cleanup_mapped_device()
1729 bioset_exit(&md->bs); in cleanup_mapped_device()
1730 bioset_exit(&md->io_bs); in cleanup_mapped_device()
1732 if (md->dax_dev) { in cleanup_mapped_device()
1733 kill_dax(md->dax_dev); in cleanup_mapped_device()
1734 put_dax(md->dax_dev); in cleanup_mapped_device()
1735 md->dax_dev = NULL; in cleanup_mapped_device()
1738 if (md->disk) { in cleanup_mapped_device()
1740 md->disk->private_data = NULL; in cleanup_mapped_device()
1742 del_gendisk(md->disk); in cleanup_mapped_device()
1743 put_disk(md->disk); in cleanup_mapped_device()
1746 if (md->queue) in cleanup_mapped_device()
1747 blk_cleanup_queue(md->queue); in cleanup_mapped_device()
1749 cleanup_srcu_struct(&md->io_barrier); in cleanup_mapped_device()
1751 if (md->bdev) { in cleanup_mapped_device()
1752 bdput(md->bdev); in cleanup_mapped_device()
1753 md->bdev = NULL; in cleanup_mapped_device()
1756 mutex_destroy(&md->suspend_lock); in cleanup_mapped_device()
1757 mutex_destroy(&md->type_lock); in cleanup_mapped_device()
1758 mutex_destroy(&md->table_devices_lock); in cleanup_mapped_device()
1774 DMWARN("unable to allocate device, out of memory."); in alloc_dev()
1789 r = init_srcu_struct(&md->io_barrier); in alloc_dev()
1793 md->numa_node_id = numa_node_id; in alloc_dev()
1794 md->init_tio_pdu = false; in alloc_dev()
1795 md->type = DM_TYPE_NONE; in alloc_dev()
1796 mutex_init(&md->suspend_lock); in alloc_dev()
1797 mutex_init(&md->type_lock); in alloc_dev()
1798 mutex_init(&md->table_devices_lock); in alloc_dev()
1799 spin_lock_init(&md->deferred_lock); in alloc_dev()
1800 atomic_set(&md->holders, 1); in alloc_dev()
1801 atomic_set(&md->open_count, 0); in alloc_dev()
1802 atomic_set(&md->event_nr, 0); in alloc_dev()
1803 atomic_set(&md->uevent_seq, 0); in alloc_dev()
1804 INIT_LIST_HEAD(&md->uevent_list); in alloc_dev()
1805 INIT_LIST_HEAD(&md->table_devices); in alloc_dev()
1806 spin_lock_init(&md->uevent_lock); in alloc_dev()
1809 * default to bio-based until DM table is loaded and md->type in alloc_dev()
1810 * established. If request-based table is loaded: blk-mq will in alloc_dev()
1813 md->queue = blk_alloc_queue(numa_node_id); in alloc_dev()
1814 if (!md->queue) in alloc_dev()
1817 md->disk = alloc_disk_node(1, md->numa_node_id); in alloc_dev()
1818 if (!md->disk) in alloc_dev()
1821 init_waitqueue_head(&md->wait); in alloc_dev()
1822 INIT_WORK(&md->work, dm_wq_work); in alloc_dev()
1823 init_waitqueue_head(&md->eventq); in alloc_dev()
1824 init_completion(&md->kobj_holder.completion); in alloc_dev()
1826 md->disk->major = _major; in alloc_dev()
1827 md->disk->first_minor = minor; in alloc_dev()
1828 md->disk->fops = &dm_blk_dops; in alloc_dev()
1829 md->disk->queue = md->queue; in alloc_dev()
1830 md->disk->private_data = md; in alloc_dev()
1831 sprintf(md->disk->disk_name, "dm-%d", minor); in alloc_dev()
1834 md->dax_dev = alloc_dax(md, md->disk->disk_name, in alloc_dev()
1836 if (IS_ERR(md->dax_dev)) in alloc_dev()
1840 add_disk_no_queue_reg(md->disk); in alloc_dev()
1841 format_dev_t(md->name, MKDEV(_major, minor)); in alloc_dev()
1843 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); in alloc_dev()
1844 if (!md->wq) in alloc_dev()
1847 md->bdev = bdget_disk(md->disk, 0); in alloc_dev()
1848 if (!md->bdev) in alloc_dev()
1851 dm_stats_init(&md->stats); in alloc_dev()
1877 int minor = MINOR(disk_devt(md->disk)); in free_dev()
1883 free_table_devices(&md->table_devices); in free_dev()
1884 dm_stats_cleanup(&md->stats); in free_dev()
1902 bioset_exit(&md->bs); in __bind_mempools()
1903 bioset_exit(&md->io_bs); in __bind_mempools()
1905 } else if (bioset_initialized(&md->bs)) { in __bind_mempools()
1907 * There's no need to reload with request-based dm in __bind_mempools()
1909 * Note for future: If you are to reload bioset, in __bind_mempools()
1910 * prep-ed requests in the queue may refer in __bind_mempools()
1911 * to bio from the old bioset, so you must walk in __bind_mempools()
1912 * through the queue to unprep. in __bind_mempools()
1918 bioset_initialized(&md->bs) || in __bind_mempools()
1919 bioset_initialized(&md->io_bs)); in __bind_mempools()
1921 ret = bioset_init_from_src(&md->bs, &p->bs); in __bind_mempools()
1924 ret = bioset_init_from_src(&md->io_bs, &p->io_bs); in __bind_mempools()
1926 bioset_exit(&md->bs); in __bind_mempools()
1934 * Bind a table to the device.
1942 spin_lock_irqsave(&md->uevent_lock, flags); in event_callback()
1943 list_splice_init(&md->uevent_list, &uevents); in event_callback()
1944 spin_unlock_irqrestore(&md->uevent_lock, flags); in event_callback()
1946 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); in event_callback()
1948 atomic_inc(&md->event_nr); in event_callback()
1949 wake_up(&md->eventq); in event_callback()
1960 struct request_queue *q = md->queue; in __bind()
1965 lockdep_assert_held(&md->suspend_lock); in __bind()
1973 memset(&md->geometry, 0, sizeof(md->geometry)); in __bind()
1975 set_capacity(md->disk, size); in __bind()
1976 bd_set_nr_sectors(md->bdev, size); in __bind()
1982 * for request-based during suspension. So stop it to prevent in __bind()
1985 * because request-based dm may be run just after the setting. in __bind()
1992 * Leverage the fact that request-based DM targets are in __bind()
1993 * immutable singletons - used to optimize dm_mq_queue_rq. in __bind()
1995 md->immutable_target = dm_table_get_immutable_target(t); in __bind()
2004 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in __bind()
2005 rcu_assign_pointer(md->map, (void *)t); in __bind()
2006 md->immutable_target_type = dm_table_get_immutable_target_type(t); in __bind()
2017 * Returns unbound table for the caller to free.
2021 struct dm_table *map = rcu_dereference_protected(md->map, 1); in __unbind()
2027 RCU_INIT_POINTER(md->map, NULL); in __unbind()
2043 return -ENXIO; in dm_create()
2056 * Functions to manage md->type.
2057 * All are required to hold md->type_lock.
2061 mutex_lock(&md->type_lock); in dm_lock_md_type()
2066 mutex_unlock(&md->type_lock); in dm_unlock_md_type()
2071 BUG_ON(!mutex_is_locked(&md->type_lock)); in dm_set_md_type()
2072 md->type = type; in dm_set_md_type()
2077 return md->type; in dm_get_md_type()
2082 return md->immutable_target_type; in dm_get_immutable_target_type()
2091 BUG_ON(!atomic_read(&md->holders)); in dm_get_queue_limits()
2092 return &md->queue->limits; in dm_get_queue_limits()
2107 md->disk->fops = &dm_rq_blk_dops; in dm_setup_md_queue()
2110 DMERR("Cannot initialize queue for request-based dm mapped device"); in dm_setup_md_queue()
2127 dm_table_set_restrictions(t, md->queue, &limits); in dm_setup_md_queue()
2128 blk_register_queue(md->disk); in dm_setup_md_queue()
2145 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { in dm_get_md()
2159 return md->interface_ptr; in dm_get_mdptr()
2164 md->interface_ptr = ptr; in dm_set_mdptr()
2169 atomic_inc(&md->holders); in dm_get()
2170 BUG_ON(test_bit(DMF_FREEING, &md->flags)); in dm_get()
2176 if (test_bit(DMF_FREEING, &md->flags)) { in dm_hold()
2178 return -EBUSY; in dm_hold()
2188 return md->name; in dm_device_name()
2201 set_bit(DMF_FREEING, &md->flags); in __dm_destroy()
2204 blk_set_queue_dying(md->queue); in __dm_destroy()
2208 * do not race with internal suspend. in __dm_destroy()
2210 mutex_lock(&md->suspend_lock); in __dm_destroy()
2214 set_bit(DMF_SUSPENDED, &md->flags); in __dm_destroy()
2215 set_bit(DMF_POST_SUSPENDING, &md->flags); in __dm_destroy()
2220 mutex_unlock(&md->suspend_lock); in __dm_destroy()
2223 * Rare, but there may be I/O requests still going to complete, in __dm_destroy()
2224 * for example. Wait for all references to disappear. in __dm_destroy()
2229 while (atomic_read(&md->holders)) in __dm_destroy()
2231 else if (atomic_read(&md->holders)) in __dm_destroy()
2233 dm_device_name(md), atomic_read(&md->holders)); in __dm_destroy()
2252 atomic_dec(&md->holders); in dm_put()
2259 struct hd_struct *part = &dm_disk(md)->part0; in md_in_flight_bios()
2276 prepare_to_wait(&md->wait, &wait, task_state); in dm_wait_for_bios_completion()
2282 r = -EINTR; in dm_wait_for_bios_completion()
2288 finish_wait(&md->wait, &wait); in dm_wait_for_bios_completion()
2297 if (!queue_is_mq(md->queue)) in dm_wait_for_completion()
2301 if (!blk_mq_queue_inflight(md->queue)) in dm_wait_for_completion()
2305 r = -EINTR; in dm_wait_for_completion()
2323 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { in dm_wq_work()
2324 spin_lock_irq(&md->deferred_lock); in dm_wq_work()
2325 bio = bio_list_pop(&md->deferred); in dm_wq_work()
2326 spin_unlock_irq(&md->deferred_lock); in dm_wq_work()
2337 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); in dm_queue_flush()
2339 queue_work(md->wq, &md->work); in dm_queue_flush()
2343 * Swap in a new table, returning the old one for the caller to destroy.
2347 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); in dm_swap_table()
2351 mutex_lock(&md->suspend_lock); in dm_swap_table()
2366 limits = md->queue->limits; in dm_swap_table()
2382 mutex_unlock(&md->suspend_lock); in dm_swap_table()
2387 * Functions to lock and unlock any filesystem running on the
2394 WARN_ON(md->frozen_sb); in lock_fs()
2396 md->frozen_sb = freeze_bdev(md->bdev); in lock_fs()
2397 if (IS_ERR(md->frozen_sb)) { in lock_fs()
2398 r = PTR_ERR(md->frozen_sb); in lock_fs()
2399 md->frozen_sb = NULL; in lock_fs()
2403 set_bit(DMF_FROZEN, &md->flags); in lock_fs()
2410 if (!test_bit(DMF_FROZEN, &md->flags)) in unlock_fs()
2413 thaw_bdev(md->bdev, md->frozen_sb); in unlock_fs()
2414 md->frozen_sb = NULL; in unlock_fs()
2415 clear_bit(DMF_FROZEN, &md->flags); in unlock_fs()
2424 * now. There is no request-processing activity. All new requests
2425 * are being added to md->deferred list.
2435 lockdep_assert_held(&md->suspend_lock); in __dm_suspend()
2442 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); in __dm_suspend()
2453 * Flush I/O to the device. in __dm_suspend()
2456 * (lock_fs() flushes I/Os and waits for them to complete.) in __dm_suspend()
2468 * to target drivers i.e. no one may be executing in __dm_suspend()
2471 * To get all processes out of __split_and_process_bio in dm_submit_bio, in __dm_suspend()
2472 * we take the write lock. To prevent any process from reentering in __dm_suspend()
2475 * flush_workqueue(md->wq). in __dm_suspend()
2477 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); in __dm_suspend()
2479 synchronize_srcu(&md->io_barrier); in __dm_suspend()
2482 * Stop md->queue before flushing md->wq in case request-based in __dm_suspend()
2483 * dm defers requests to md->wq from md->queue. in __dm_suspend()
2486 dm_stop_queue(md->queue); in __dm_suspend()
2488 flush_workqueue(md->wq); in __dm_suspend()
2492 * We call dm_wait_for_completion to wait for all existing requests in __dm_suspend()
2493 * to finish. in __dm_suspend()
2497 set_bit(dmf_suspended_flag, &md->flags); in __dm_suspend()
2500 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); in __dm_suspend()
2502 synchronize_srcu(&md->io_barrier); in __dm_suspend()
2509 dm_start_queue(md->queue); in __dm_suspend()
2520 * We need to be able to change a mapping table under a mounted
2521 * filesystem. For example we might want to move some data in
2523 * dm_bind_table, dm_suspend must be called to flush any in
2527 * Suspend mechanism in request-based dm.
2531 * 3. Wait for all in-flight I/Os to be completed or requeued.
2533 * To abort suspend, start the request_queue.
2541 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); in dm_suspend()
2544 r = -EINVAL; in dm_suspend()
2550 mutex_unlock(&md->suspend_lock); in dm_suspend()
2551 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); in dm_suspend()
2557 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in dm_suspend()
2563 set_bit(DMF_POST_SUSPENDING, &md->flags); in dm_suspend()
2565 clear_bit(DMF_POST_SUSPENDING, &md->flags); in dm_suspend()
2568 mutex_unlock(&md->suspend_lock); in dm_suspend()
2585 * Request-based dm is queueing the deferred I/Os in its request_queue. in __dm_resume()
2588 dm_start_queue(md->queue); in __dm_resume()
2601 r = -EINVAL; in dm_resume()
2602 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); in dm_resume()
2609 mutex_unlock(&md->suspend_lock); in dm_resume()
2610 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); in dm_resume()
2616 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in dm_resume()
2624 clear_bit(DMF_SUSPENDED, &md->flags); in dm_resume()
2626 mutex_unlock(&md->suspend_lock); in dm_resume()
2632 * Internal suspend/resume works like userspace-driven suspend. It waits
2633 * until all bios finish and prevents issuing new bios to the target drivers.
2641 lockdep_assert_held(&md->suspend_lock); in __dm_internal_suspend()
2643 if (md->internal_suspend_count++) in __dm_internal_suspend()
2644 return; /* nested internal suspend */ in __dm_internal_suspend()
2647 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); in __dm_internal_suspend()
2648 return; /* nest suspend */ in __dm_internal_suspend()
2651 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in __dm_internal_suspend()
2654 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is in __dm_internal_suspend()
2655 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend in __dm_internal_suspend()
2656 * would require changing .presuspend to return an error -- avoid this in __dm_internal_suspend()
2657 * until there is a need for more elaborate variants of internal suspend. in __dm_internal_suspend()
2662 set_bit(DMF_POST_SUSPENDING, &md->flags); in __dm_internal_suspend()
2664 clear_bit(DMF_POST_SUSPENDING, &md->flags); in __dm_internal_suspend()
2669 BUG_ON(!md->internal_suspend_count); in __dm_internal_resume()
2671 if (--md->internal_suspend_count) in __dm_internal_resume()
2672 return; /* resume from nested internal suspend */ in __dm_internal_resume()
2675 goto done; /* resume from nested suspend */ in __dm_internal_resume()
2678 * NOTE: existing callers don't need to call dm_table_resume_targets in __dm_internal_resume()
2679 * (which may fail -- so best to avoid it for now by passing NULL map) in __dm_internal_resume()
2684 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); in __dm_internal_resume()
2686 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); in __dm_internal_resume()
2691 mutex_lock(&md->suspend_lock); in dm_internal_suspend_noflush()
2693 mutex_unlock(&md->suspend_lock); in dm_internal_suspend_noflush()
2699 mutex_lock(&md->suspend_lock); in dm_internal_resume()
2701 mutex_unlock(&md->suspend_lock); in dm_internal_resume()
2706 * Fast variants of internal suspend/resume hold md->suspend_lock,
2707 * which prevents interaction with userspace-driven suspend.
2712 mutex_lock(&md->suspend_lock); in dm_internal_suspend_fast()
2716 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); in dm_internal_suspend_fast()
2717 synchronize_srcu(&md->io_barrier); in dm_internal_suspend_fast()
2718 flush_workqueue(md->wq); in dm_internal_suspend_fast()
2731 mutex_unlock(&md->suspend_lock); in dm_internal_resume_fast()
2735 /*-----------------------------------------------------------------
2737 *---------------------------------------------------------------*/
2749 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); in dm_kobject_uevent()
2753 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, in dm_kobject_uevent()
2764 return atomic_add_return(1, &md->uevent_seq); in dm_next_uevent_seq()
2769 return atomic_read(&md->event_nr); in dm_get_event_nr()
2774 return wait_event_interruptible(md->eventq, in dm_wait_event()
2775 (event_nr != atomic_read(&md->event_nr))); in dm_wait_event()
2782 spin_lock_irqsave(&md->uevent_lock, flags); in dm_uevent_add()
2783 list_add(elist, &md->uevent_list); in dm_uevent_add()
2784 spin_unlock_irqrestore(&md->uevent_lock, flags); in dm_uevent_add()
2793 return md->disk; in dm_disk()
2799 return &md->kobj_holder.kobj; in dm_kobject()
2809 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { in dm_get_from_kobject()
2822 return test_bit(DMF_SUSPENDED, &md->flags); in dm_suspended_md()
2827 return test_bit(DMF_POST_SUSPENDING, &md->flags); in dm_post_suspending_md()
2832 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); in dm_suspended_internally_md()
2837 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); in dm_test_deferred_remove_flag()
2842 return dm_suspended_md(ti->table->md); in dm_suspended()
2848 return dm_post_suspending_md(ti->table->md); in dm_post_suspending()
2854 return __noflush_suspending(ti->table->md); in dm_noflush_suspending()
2862 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); in dm_alloc_md_mempools()
2876 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); in dm_alloc_md_mempools()
2879 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) in dm_alloc_md_mempools()
2885 /* per_io_data_size is used for blk-mq pdu at queue allocation */ in dm_alloc_md_mempools()
2891 ret = bioset_init(&pools->bs, pool_size, front_pad, 0); in dm_alloc_md_mempools()
2895 if (integrity && bioset_integrity_create(&pools->bs, pool_size)) in dm_alloc_md_mempools()
2911 bioset_exit(&pools->bs); in dm_free_md_mempools()
2912 bioset_exit(&pools->io_bs); in dm_free_md_mempools()
2927 struct mapped_device *md = bdev->bd_disk->private_data; in dm_call_pr()
2930 int ret = -ENOTTY, srcu_idx; in dm_call_pr()
2941 ret = -EINVAL; in dm_call_pr()
2942 if (!ti->type->iterate_devices) in dm_call_pr()
2945 ret = ti->type->iterate_devices(ti, fn, data); in dm_call_pr()
2952 * For register / unregister we need to manually call out to every path.
2958 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; in __dm_pr_register()
2960 if (!ops || !ops->pr_register) in __dm_pr_register()
2961 return -EOPNOTSUPP; in __dm_pr_register()
2962 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); in __dm_pr_register()
2978 /* unregister all paths if we failed to register any path */ in dm_pr_register()
2992 struct mapped_device *md = bdev->bd_disk->private_data; in dm_pr_reserve()
3000 ops = bdev->bd_disk->fops->pr_ops; in dm_pr_reserve()
3001 if (ops && ops->pr_reserve) in dm_pr_reserve()
3002 r = ops->pr_reserve(bdev, key, type, flags); in dm_pr_reserve()
3004 r = -EOPNOTSUPP; in dm_pr_reserve()
3012 struct mapped_device *md = bdev->bd_disk->private_data; in dm_pr_release()
3020 ops = bdev->bd_disk->fops->pr_ops; in dm_pr_release()
3021 if (ops && ops->pr_release) in dm_pr_release()
3022 r = ops->pr_release(bdev, key, type); in dm_pr_release()
3024 r = -EOPNOTSUPP; in dm_pr_release()
3033 struct mapped_device *md = bdev->bd_disk->private_data; in dm_pr_preempt()
3041 ops = bdev->bd_disk->fops->pr_ops; in dm_pr_preempt()
3042 if (ops && ops->pr_preempt) in dm_pr_preempt()
3043 r = ops->pr_preempt(bdev, old_key, new_key, type, abort); in dm_pr_preempt()
3045 r = -EOPNOTSUPP; in dm_pr_preempt()
3053 struct mapped_device *md = bdev->bd_disk->private_data; in dm_pr_clear()
3061 ops = bdev->bd_disk->fops->pr_ops; in dm_pr_clear()
3062 if (ops && ops->pr_clear) in dm_pr_clear()
3063 r = ops->pr_clear(bdev, key); in dm_pr_clear()
3065 r = -EOPNOTSUPP; in dm_pr_clear()
3117 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3123 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");