linux-block

2024-12-02 来源：个人技术集锦

1 struct bio 表示一次块设备的IO请求。
2 struct request是bio(elevator)提交给IO调度器后产生的数据，一个request中存放着顺序排列的bio。当设备提交bio 给IO调度器时，新的bio可能被合并到request_queue中已有的request结构中(甚至合并到已有的bio中)，或者生成新的request。request表示等待处理的块设备IO请求，结构里有sector等信息；通过queuelist把自己挂在它依附的request_queue上。
3 struct request_queue每个物理设备对应一个，是request所形成的list，其结构中有一些函数指针。

一主要数据结构

1 gendisk

代表一个通用块设备，每个磁盘对应个gendisk结构，包括该disk上的请求队列，分区列表/第一个分区，块设备操作表等重要信息。

struct gendisk {
	/* major, first_minor and minors are input parameters only,
	 * don't use directly.  Use disk_devt() and disk_max_parts().
	 */
	int major;			/* major number of driver */
	int first_minor;
	int minors;                     /* maximum number of minors, =1 for
                                         * disks that can't be partitioned. */

	char disk_name[DISK_NAME_LEN];	/* name of major driver */
	char *(*devnode)(struct gendisk *gd, umode_t *mode);

	unsigned int events;		/* supported events */
	unsigned int async_events;	/* async events, subset of all */

	/* Array of pointers to partitions indexed by partno.
	 * Protected with matching bdev lock but stat and other
	 * non-critical accesses use RCU.  Always access through
	 * helpers.
	 */
	struct disk_part_tbl __rcu *part_tbl;
	struct hd_struct part0;

	const struct block_device_operations *fops;
	struct request_queue *queue;
	void *private_data;

	int flags;
	struct device *driverfs_dev;  // FIXME: remove
	struct kobject *slave_dir;

	struct timer_rand_state *random;
	atomic_t sync_io;		/* RAID */
	struct disk_events *ev;
#ifdef  CONFIG_BLK_DEV_INTEGRITY
	struct blk_integrity *integrity;
#endif
	int node_id;
};

major:块设备的主设备号。
first_minor:起始次设备号。
minors:描述了该块设备有多少个次设备号，或者说有多少个分区，如果minors为1，则表示该块设备没有分区。
part_tbl:整个块设备的分区信息都包含在里面，其核心结构是一个struct hd_struct的指针数组，每一项都指向一个描述分区的hd_struct结构。
fops:指向特定于设备的底层操作函数集。
queue:块设备的请求队列，所有针对该设备的请求都会放入该请求队列中，经过I/O scheduler的处理再进行提交。

2 hd_struct

保存一个分区信息，包括起始扇区，扇区数，分区号等基本信息。

struct hd_struct {
	sector_t start_sect;
	/*
	 * nr_sects is protected by sequence counter. One might extend a
	 * partition while IO is happening to it and update of nr_sects
	 * can be non-atomic on 32bit machines with 64bit sector_t.
	 */
	sector_t nr_sects;
	seqcount_t nr_sects_seq;
	sector_t alignment_offset;
	unsigned int discard_alignment;
	struct device __dev;
	struct kobject *holder_dir;
	int policy, partno;
	struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
	int make_it_fail;
#endif
	unsigned long stamp;
	atomic_t in_flight[2];
#ifdef	CONFIG_SMP
	struct disk_stats __percpu *dkstats;
#else
	struct disk_stats dkstats;
#endif
	atomic_t ref;
	struct rcu_head rcu_head;
};

3 disk_part_tbl

磁盘的分区表。

struct disk_part_tbl {
	struct rcu_head rcu_head;
	int len;
	struct hd_struct __rcu *last_lookup;
	struct hd_struct __rcu *part[];
};

4 block_device

可以是整个磁盘，也可以是一个分区。如果是一个分区块设备，则bd_contains会指向分区所在磁盘的block_device，bd_part则指向分区信息结构hd_struct。

struct block_device {
	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
	int			bd_openers;
	struct inode *		bd_inode;	/* will die */
	struct super_block *	bd_super;
	struct mutex		bd_mutex;	/* open/close mutex */
	struct list_head	bd_inodes;
	void *			bd_claiming;
	void *			bd_holder;
	int			bd_holders;
	bool			bd_write_holder;
#ifdef CONFIG_SYSFS
	struct list_head	bd_holder_disks;
#endif
	struct block_device *	bd_contains;
	unsigned		bd_block_size;
	struct hd_struct *	bd_part;
	/* number of times partitions within this device have been opened. */
	unsigned		bd_part_count;
	int			bd_invalidated;
	struct gendisk *	bd_disk;
	struct request_queue *  bd_queue;
	struct list_head	bd_list;
	/*
	 * Private data.  You must have bd_claim'ed the block_device
	 * to use this.  NOTE:  bd_claim allows an owner to claim
	 * the same device multiple times, the owner must take special
	 * care to not mess up bd_private for that case.
	 */
	unsigned long		bd_private;

	/* The counter of freeze processes */
	int			bd_fsfreeze_count;
	/* Mutex for freeze */
	struct mutex		bd_fsfreeze_mutex;
};

bd_dev:该设备(分区)的设备号。
bd_inode:指向该设备文件的inode。
bd_openers:一个引用计数，记录了该块设备打开的次数，或者说有多少个进程打开了该设备。
bd_contains:如果该block_device描述的是一个分区，则该变量指向描述主块设备的block_device，反之，其指向本身。
bd_part:如果该block_device描述的是一个分区，则该变量指向分区的信息。
bd_part_count:如果是分区，该变量记录了分区被打开的次数，在进行分区的重新扫描前，要保证该计数值为0。
bd_disk:指向描述整个设备的gendisk结构。

5 buffer_head

在内核层对块设备的IO请求是以块为单位的。buffer_head是一个块在内存中的元数据信息。b_data指向该块数据的实际地址。b_this_page则将通过page中的块连接起来。以前版本的buffer_head是fs到block device的io请求单元，现在已经改为bio了。

struct buffer_head {
	unsigned long b_state;		/* buffer state bitmap (see above) */
	struct buffer_head *b_this_page;/* circular list of page's buffers */
	struct page *b_page;		/* the page this bh is mapped to */

	sector_t b_blocknr;		/* start block number */
	size_t b_size;			/* size of mapping */
	char *b_data;			/* pointer to data within the page */

	struct block_device *b_bdev;
	bh_end_io_t *b_end_io;		/* I/O completion */
 	void *b_private;		/* reserved for b_end_io */
	struct list_head b_assoc_buffers; /* associated with another mapping */
	struct address_space *b_assoc_map;	/* mapping this buffer is
						   associated with */
	atomic_t b_count;		/* users using this buffer_head */
};

6 bio

bio封装了一次实际的块设备io请求。这是块设备io请求的基本单位。bi_vcnt表示bio_vec的数目。

struct bio {
	sector_t		bi_sector;	/* device address in 512 byte
						   sectors */
	struct bio		*bi_next;	/* request queue link */
	struct block_device	*bi_bdev;
	unsigned long		bi_flags;	/* status, command, etc */
	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
						 * top bits priority
						 */

	unsigned short		bi_vcnt;	/* how many bio_vec's */
	unsigned short		bi_idx;		/* current index into bvl_vec */

	/* Number of segments in this BIO after
	 * physical address coalescing is performed.
	 */
	unsigned int		bi_phys_segments;

	unsigned int		bi_size;	/* residual I/O count */

	/*
	 * To keep track of the max segment size, we account for the
	 * sizes of the first and last mergeable segments in this bio.
	 */
	unsigned int		bi_seg_front_size;
	unsigned int		bi_seg_back_size;

	bio_end_io_t		*bi_end_io;

	void			*bi_private;
#ifdef CONFIG_BLK_CGROUP
	/*
	 * Optional ioc and css associated with this bio.  Put on bio
	 * release.  Read comment on top of bio_associate_current().
	 */
	struct io_context	*bi_ioc;
	struct cgroup_subsys_state *bi_css;
#endif
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	struct bio_integrity_payload *bi_integrity;  /* data integrity */
#endif

	/*
	 * Everything starting with bi_max_vecs will be preserved by bio_reset()
	 */

	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */

	atomic_t		bi_cnt;		/* pin count */

	struct bio_vec		*bi_io_vec;	/* the actual vec list */

	struct bio_set		*bi_pool;

	/*
	 * We can inline a number of vecs at the end of the bio, to avoid
	 * double allocations for a small number of bio_vecs. This member
	 * MUST obviously be kept at the very end of the bio.
	 */
	struct bio_vec		bi_inline_vecs[0];
};

bi_sector:该I/O操作的起始扇区号。
bi_rw:指明了读写方向。
bi_vcnt:该I/O操作中涉及到了多少个缓存向量，每个缓存向量由[page,offset,len]来描述。
bi_idx:指示当前的缓存向量。
bi_io_vec：缓存向量数组。

7 bio_vec

bio_vec表示一次bio涉及到的数据片段(segment)，由所在内存页地址，长度，偏移地址等定位。一次bio一般包含多个segment。

struct bio_vec {
	struct page	*bv_page;
	unsigned int	bv_len;
	unsigned int	bv_offset;
};

8 request

块设备层IO等待请求(pending I/O request)。内核中的bio请求在经过io调度排序后进入块设备层，会尝试合并到已有的request。bio结构中的bi_next将队列中的bio请求串成一个队列。bio/biotail域指向队列的首尾。

struct request {
	struct list_head queuelist;
	struct call_single_data csd;

	struct request_queue *q;

	unsigned int cmd_flags;
	enum rq_cmd_type_bits cmd_type;
	unsigned long atomic_flags;

	int cpu;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	struct hlist_node hash;	/* merge hash */
	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		void *completion_data;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.  Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq		*icq;
			void			*priv[2];
		} elv;

		struct {
			unsigned int		seq;
			struct list_head	list;
			rq_end_io_fn		*saved_end_io;
		} flush;
	};

	struct gendisk *rq_disk;
	struct hd_struct *part;
	unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
	struct request_list *rl;		/* rl this rq is alloced from */
	unsigned long long start_time_ns;
	unsigned long long io_start_time_ns;    /* when passed to hardware */
#endif
	/* Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	unsigned short nr_integrity_segments;
#endif

	unsigned short ioprio;

	int ref_count;

	void *special;		/* opaque pointer available for LLD use */
	char *buffer;		/* kaddr of the current segment if available */

	int tag;
	int errors;

	/*
	 * when request is used as a packet command carrier
	 */
	unsigned char __cmd[BLK_MAX_CDB];
	unsigned char *cmd;
	unsigned short cmd_len;

	unsigned int extra_len;	/* length of alignment and padding */
	unsigned int sense_len;
	unsigned int resid_len;	/* residual count */
	void *sense;

	unsigned long deadline;
	struct list_head timeout_list;
	unsigned int timeout;
	int retries;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;

	/* for bidi */
	struct request *next_rq;
};

queuelist:用于将request链入请求队列的链表元素。
q:指向所属的请求队列。
__sector:下一个要传输的bio的起始扇区号。
__data_len:request要传输的数据字节数。
bio,biotail:用于维护request中的bio链表。

9 request_queue

request_queue维护块设备层IO请求队列，队列中包含多个request。request_queue同时定义了处理队列的函数接口，不同的设备注册时需要实现这些IO处理接口。

struct request_queue {
	/*
	 * Together with queue_head for cacheline sharing
	 */
	struct list_head	queue_head;
	struct request		*last_merge;
	struct elevator_queue	*elevator;
	int			nr_rqs[2];	/* # allocated [a]sync rqs */
	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */

	/*
	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
	 * is used, root blkg allocates from @q->root_rl and all other
	 * blkgs from their own blkg->rl.  Which one to use should be
	 * determined using bio_request_list().
	 */
	struct request_list	root_rl;

	request_fn_proc		*request_fn;
	make_request_fn		*make_request_fn;
	prep_rq_fn		*prep_rq_fn;
	unprep_rq_fn		*unprep_rq_fn;
	merge_bvec_fn		*merge_bvec_fn;
	softirq_done_fn		*softirq_done_fn;
	rq_timed_out_fn		*rq_timed_out_fn;
	dma_drain_needed_fn	*dma_drain_needed;
	lld_busy_fn		*lld_busy_fn;

	/*
	 * Dispatch queue sorting
	 */
	sector_t		end_sector;
	struct request		*boundary_rq;

	/*
	 * Delayed queue handling
	 */
	struct delayed_work	delay_work;

	struct backing_dev_info	backing_dev_info;

	/*
	 * The queue owner gets to use this for whatever they like.
	 * ll_rw_blk doesn't touch it.
	 */
	void			*queuedata;

	/*
	 * various queue flags, see QUEUE_* below
	 */
	unsigned long		queue_flags;

	/*
	 * ida allocated id for this queue.  Used to index queues from
	 * ioctx.
	 */
	int			id;

	/*
	 * queue needs bounce pages for pages above this limit
	 */
	gfp_t			bounce_gfp;

	/*
	 * protects queue structures from reentrancy. ->__queue_lock should
	 * _never_ be used directly, it is queue private. always use
	 * ->queue_lock.
	 */
	spinlock_t		__queue_lock;
	spinlock_t		*queue_lock;

	/*
	 * queue kobject
	 */
	struct kobject kobj;

#ifdef CONFIG_PM_RUNTIME
	struct device		*dev;
	int			rpm_status;
	unsigned int		nr_pending;
#endif

	/*
	 * queue settings
	 */
	unsigned long		nr_requests;	/* Max # of requests */
	unsigned int		nr_congestion_on;
	unsigned int		nr_congestion_off;
	unsigned int		nr_batching;

	unsigned int		dma_drain_size;
	void			*dma_drain_buffer;
	unsigned int		dma_pad_mask;
	unsigned int		dma_alignment;

	struct blk_queue_tag	*queue_tags;
	struct list_head	tag_busy_list;

	unsigned int		nr_sorted;
	unsigned int		in_flight[2];
	/*
	 * Number of active block driver functions for which blk_drain_queue()
	 * must wait. Must be incremented around functions that unlock the
	 * queue_lock internally, e.g. scsi_request_fn().
	 */
	unsigned int		request_fn_active;

	unsigned int		rq_timeout;
	struct timer_list	timeout;
	struct list_head	timeout_list;

	struct list_head	icq_list;
#ifdef CONFIG_BLK_CGROUP
	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
	struct blkcg_gq		*root_blkg;
	struct list_head	blkg_list;
#endif

	struct queue_limits	limits;

	/*
	 * sg stuff
	 */
	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
#endif
	/*
	 * for flush operations
	 */
	unsigned int		flush_flags;
	unsigned int		flush_not_queueable:1;
	unsigned int		flush_queue_delayed:1;
	unsigned int		flush_pending_idx:1;
	unsigned int		flush_running_idx:1;
	unsigned long		flush_pending_since;
	struct list_head	flush_queue[2];
	struct list_head	flush_data_in_flight;
	struct request		flush_rq;

	struct mutex		sysfs_lock;

	int			bypass_depth;

#if defined(CONFIG_BLK_DEV_BSG)
	bsg_job_fn		*bsg_job_fn;
	int			bsg_job_size;
	struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_CGROUP
	struct list_head	all_q_node;
#endif
#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttle data */
	struct throtl_data *td;
#endif
	struct rcu_head		rcu_head;
};

二主要函数

1 submit_bh

submit_bh是内核发送IO请求给块设备的函数，目前较新版本的内核中该函数会调用submit_bio执行实际请求。

int submit_bh(int rw, struct buffer_head *bh)
{
	return _submit_bh(rw, bh, 0);
}
int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
{
	struct bio *bio;
	int ret = 0;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (rw & WRITE))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);

	bio->bi_vcnt = 1;
	bio->bi_size = bh->b_size;

	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;
	bio->bi_flags |= bio_flags;

	/* Take care of bh's that straddle the end of the device */
	guard_bh_eod(rw, bio, bh);

	if (buffer_meta(bh))
		rw |= REQ_META;
	if (buffer_prio(bh))
		rw |= REQ_PRIO;

	bio_get(bio);
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	bio_put(bio);
	return ret;
}

2 submit_bio

submit_bio函数会调用generic_make_request执行实际的bio请求。

void submit_bio(int rw, struct bio *bio)
{
	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		unsigned int count;

		if (unlikely(rw & REQ_WRITE_SAME))
			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
		else
			count = bio_sectors(bio);

		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_size);
			count_vm_events(PGPGIN, count);
		}

		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_sector,
				bdevname(bio->bi_bdev, b),
				count);
		}
	}

	generic_make_request(bio);
}

3 generic_make_request
generic_make_request则循环处理bio链表。最终调用request_queue中的make_request_fn处理函数处理实际的IO请求。

void generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (!generic_make_request_checks(bio))
		return;

	/*
	 * We only want one ->make_request_fn to be active at a time, else
	 * stack usage with stacked devices could be a problem.  So use
	 * current->bio_list to keep a list of requests submited by a
	 * make_request_fn function.  current->bio_list is also used as a
	 * flag to say if generic_make_request is currently active in this
	 * task or not.  If it is NULL, then no make_request is active.  If
	 * it is non-NULL, then a make_request is active, and new requests
	 * should be added at the tail
	 */
	if (current->bio_list) {
		bio_list_add(current->bio_list, bio);
		return;
	}

	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to a pointer to the bio_list_on_stack,
	 * thus initialising the bio_list of new bios to be
	 * added.  ->make_request() may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * of the top of the list (no pretending) and so remove it from
	 * bio_list, and call into ->make_request() again.
	 */
	BUG_ON(bio->bi_next);
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;
	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);

		q->make_request_fn(q, bio);

		bio = bio_list_pop(current->bio_list);
	} while (bio);
	current->bio_list = NULL; /* deactivate */
}

4 make_request_fn

make_request_fn是于具体block设备相关的函数，mmc子系统初始化为blk_queue_bio。mmc_blk_probe()->mmc_blk_alloc()->mmc_blk_alloc_req()->mmc_init_queue()->blk_init_queue()->blk_init_queue_node()->blk_init_allocated_queue()->blk_queue_make_request(q, blk_queue_bio)。这里就用到了IO调度算法，新的bio可能被合并到request_queue中已有的request结构中(甚至合并到已有的bio中)，或者生成新的request，经过这里就bio就变成了request。

void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
	const bool sync = !!(bio->bi_rw & REQ_SYNC);
	struct blk_plug *plug;
	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
	struct request *req;
	unsigned int request_count = 0;

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		spin_lock_irq(q->queue_lock);
		where = ELEVATOR_INSERT_FLUSH;
		goto get_rq;
	}

	/*
	 * Check if we can merge with the plugged list before grabbing
	 * any locks.
	 */
	if (attempt_plug_merge(q, bio, &request_count))
		return;

	spin_lock_irq(q->queue_lock);

	el_ret = elv_merge(q, &req, bio);
	if (el_ret == ELEVATOR_BACK_MERGE) {
		if (bio_attempt_back_merge(q, req, bio)) {
			elv_bio_merged(q, req, bio);
			if (!attempt_back_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
		if (bio_attempt_front_merge(q, req, bio)) {
			elv_bio_merged(q, req, bio);
			if (!attempt_front_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	}

get_rq:
	/*
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_SYNC;

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request(q, rw_flags, bio, GFP_NOIO);
	if (unlikely(!req)) {
		bio_endio(bio, -ENODEV);	/* @q is dead */
		goto out_unlock;
	}

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	init_request_from_bio(req, bio);

	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
		req->cpu = raw_smp_processor_id();

	plug = current->plug;
	if (plug) {
		/*
		 * If this is the first request added after a plug, fire
		 * of a plug trace. If others have been added before, check
		 * if we have multiple devices in this plug. If so, make a
		 * note to sort the list before dispatch.
		 */
		if (list_empty(&plug->list))
			trace_block_plug(q);
		else {
			if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
		}
		list_add_tail(&req->queuelist, &plug->list);
		drive_stat_acct(req, 1);
	} else {
		spin_lock_irq(q->queue_lock);
		add_acct_request(q, req, where);
		__blk_run_queue(q);
out_unlock:
		spin_unlock_irq(q->queue_lock);
	}
}

5 blk_init_queue

struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
}

例如mq->queue = blk_init_queue(mmc_request_fn, lock);
分配和初始化请求队列，q->request_fn = mmc_request_fn和blk_queue_make_request(q, blk_queue_bio)就是在这里初始化的。

三块设备驱动开发

1 分配主设备号

int register_blkdev(unsigned int major, const char *name)
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

	mutex_lock(&block_class_lock);

	/* temporary */
	if (major == 0) {
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
			printk("register_blkdev: failed to get major for %s\n",
			       name);
			ret = -EBUSY;
			goto out;
		}
		major = index;
		ret = major;
	}

	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
	strlcpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;
	else
		ret = -EBUSY;

	if (ret < 0) {
		printk("register_blkdev: cannot get major %d for %s\n",
		       major, name);
		kfree(p);
	}
out:
	mutex_unlock(&block_class_lock);
	return ret;
}

例如：res = register_blkdev(MMC_BLOCK_MAJOR, "mmc");
MMC_BLOCK_MAJOR是内核预留给mmc block设备的主设备号；我们自己开发的block设备没有预留的主设备号，通常都把major设置0，这样内核会给我们分配一个没被占用的设备号。怎么找到的呢？其实这个设备号动态生成的规则非常简单，内核维护了一个类型为struct blk_major_name的hash表major_names。

struct blk_major_name的hash表major_names。
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
} *major_names[BLKDEV_MAJOR_HASH_SIZE];

动态主设备号生成就是遍历这个hash表，找到一个没被占用的桶，返回对应的索引号，如果都被占用则注册失败，返回错误。

2 创建和初始化gendisk结构

驱动程序在注册了块设备号之后，需要创建灵魂级的数据结构gendisk, 所有后续IO提交需要用到的数据结构比如：request，request_queue，make_request_fn，request_fn等这些东西都是绑在gendisk这个核心数据结构上的，有了它就有了块设备驱动层所需的一切。

struct gendisk alloc_disk(int minors)

例如：md->disk = alloc_disk(perdev_minors);该函数负责创建一个gendisk数据结构，并设置分区数minors，并会按照只有一个分区的情况先初始化磁盘分区表指针数组gendisk->part_tbl的hd_struct域，后面会通过rescan_partitions重新扫描磁盘分区进行扩展，都是调用的disk_expand_part_tbl来完成，当前只是按一个分区来设置的。
gendisk数据结构创建好后，可以自行设置major，first_minor，fops，queue等成员，并通过set_capacity来设置驱动支持的最大扇区数。
md->disk->major = MMC_BLOCK_MAJOR;
md->disk->first_minor = devidx * perdev_minors;
md->disk->fops = &mmc_bdops;
md->disk->private_data = md;
md->disk->queue = md->queue.queue;
set_capacity(md->disk, size);这里单独再说下关于queue成员的创建，request_queue的创建主要要初始化它的2个重要的成员函数，分别是make_reuqest_fn/request_fn。
blk_queue_make_request(q, blk_queue_bio);//blk_queue_bio
mq->queue = blk_init_queue(mmc_request_fn, lock);//mmc_request_fn

make_request_fn负责处理来自上层文件系统submit_bio过来的bio请求，并通过IO调度模块将它转换为request插入到调度队列里。request_fn负责从IO请求队列里下发请求到card层，或者是从IO调度队列里将request派发到实际的IO请求队列，具体情况要看IO请求队列是不是空的。这两个重要函数是在创建request_queue的时候指定的，创建reuqest_queue可以通过blk_init_queue完成，通过入参指定reuqest_fn函数，blk_init_queue内部使用blk_queue_make_request初始化make_request_fn为通用块层提供bio处理函数。如果希望自己定义make_reuqest函数，则可以通过blk_alloc_queue来创建request_queue，然后手动设置reuqest_fn并通过blk_queue_make_request来指定make_request_fn。

3 向内核添加磁盘

我们分配和初始化好gendisk结构后，需要通过add_disk把磁盘添加进内核。

void add_disk(struct gendisk *disk)
{
	struct backing_dev_info *bdi;
	dev_t devt;
	int retval;

	/* minors == 0 indicates to use ext devt from part0 and should
	 * be accompanied with EXT_DEVT flag.  Make sure all
	 * parameters make sense.
	 */
	WARN_ON(disk->minors && !(disk->major || disk->first_minor));
	WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));

	disk->flags |= GENHD_FL_UP;

	retval = blk_alloc_devt(&disk->part0, &devt);
	if (retval) {
		WARN_ON(1);
		return;
	}
	disk_to_dev(disk)->devt = devt;

	/* ->major and ->first_minor aren't supposed to be
	 * dereferenced from here on, but set them just in case.
	 */
	disk->major = MAJOR(devt);
	disk->first_minor = MINOR(devt);

	disk_alloc_events(disk);

	/* Register BDI before referencing it from bdev */
	bdi = &disk->queue->backing_dev_info;
	bdi_register_dev(bdi, disk_devt(disk));

	blk_register_region(disk_devt(disk), disk->minors, NULL,
			    exact_match, exact_lock, disk);
	register_disk(disk);
	blk_register_queue(disk);

	/*
	 * Take an extra ref on queue which will be put on disk_release()
	 * so that it sticks around as long as @disk is there.
	 */
	WARN_ON_ONCE(!blk_get_queue(disk->queue));

	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
				   "bdi");
	WARN_ON(retval);

	disk_add_events(disk);
}

例如：add_disk(md->disk);首先通过blk_alloc_devt为设备的0号分区分配设备号devt，然后调用blk_register_region完成设备号的注册，这一步就是通过设备映射表bdev_map建立设备号与核心数据结构gendisk的一个映射关系，这一步完成设备的注册。然后会通过register_disk设置对应的sysfs结构，并为这个gendisk的分区0关联一个block_device数据结构，后续会重新扫描磁盘分区表建立好磁盘与分区的关系。

四 IO调度

每个块设备或者块设备的分区，都对应有自身的请求队列(request_queue)，而每个请求队列都可以选择一个I/O调度器来协调所递交的request。当有多个并发IO请求到块设备时，请求的顺序会影响IO的性能，因为普通的机械硬盘需要移动机械臂，相对于电和磁的传递，机械运动是非常慢速的；所以kernel一般会对IO做排序等调度后再发送到块设备层。I/O调度器的基本目的是将请求按照它们对应在块设备上的扇区号进行排列，以减少磁头的移动，提高效率。在前面讨论递交I/O请求的时候可以发现，每个request_queue都有一个request的队列，队列里的请求将按顺序被响应。实际上，除了这个队列，每个调度器自身都维护有不同数量的队列，用来对递交上来的request进行处理，而排在队列最前面的request将适时被移动到request_queue中等待响应。
IO调度算法是一种电梯算法(elevator algorithm)，目前主要有cfq/deadline/anticipatory/noop，其中cfq是Linux的默认策略；anticipatory在新的内核中已经放弃;deadline在大部分OLTP数据库应用中更具优势，IO的响应时间更稳定些；noop只对IO请求进行简单的合并，其他不干涉，类似与先来先服务；在FusionIO等IO性能很好的设备上，noop反而更具优势，所以FusionIO的驱动默认使用了noop。
代码位于block目录，block/elevator.c和include/linux/elevator.h，算法block/deadline-iosched.c。

1 主要数据结构

struct elevator_type
{
	/* managed by elevator core */
	struct kmem_cache *icq_cache;

	/* fields provided by elevator implementation */
	struct elevator_ops ops;
	size_t icq_size;	/* see iocontext.h */
	size_t icq_align;	/* ditto */
	struct elv_fs_entry *elevator_attrs;
	char elevator_name[ELV_NAME_MAX];
	struct module *elevator_owner;

	/* managed by elevator core */
	char icq_cache_name[ELV_NAME_MAX + 5];	/* elvname + "_io_cq" */
	struct list_head list;
};
struct elevator_queue
{
	struct elevator_type *type;
	void *elevator_data;
	struct kobject kobj;
	struct mutex sysfs_lock;
	unsigned int registered:1;
	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};

elevator_type对应一个调度器类型，elevator_queue对应一个调度器实例，如果内核中只有上述四种类型的调度器，则只有四个elevator_type；但是多个块设备(分区)可拥有多个相应调度器的实例，也就是elevator_queue。两个数据结构中最关键的元素都是struct elevator_ops，该结构定义了一组操作函数，用来描述请求队列的相关算法，实现对请求的处理。

struct elevator_ops
{
	elevator_merge_fn *elevator_merge_fn;
	elevator_merged_fn *elevator_merged_fn;
	elevator_merge_req_fn *elevator_merge_req_fn;
	elevator_allow_merge_fn *elevator_allow_merge_fn;
	elevator_bio_merged_fn *elevator_bio_merged_fn;

	elevator_dispatch_fn *elevator_dispatch_fn;
	elevator_add_req_fn *elevator_add_req_fn;
	elevator_activate_req_fn *elevator_activate_req_fn;
	elevator_deactivate_req_fn *elevator_deactivate_req_fn;

	elevator_completed_req_fn *elevator_completed_req_fn;

	elevator_request_list_fn *elevator_former_req_fn;
	elevator_request_list_fn *elevator_latter_req_fn;

	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */

	elevator_set_req_fn *elevator_set_req_fn;
	elevator_put_req_fn *elevator_put_req_fn;

	elevator_may_queue_fn *elevator_may_queue_fn;

	elevator_init_fn *elevator_init_fn;
	elevator_exit_fn *elevator_exit_fn;
};

elevator_merge_fn查询一个request，用于将bio并入。
elevator_merge_req_fn将两个合并后的请求中多余的那个给删除。
elevator_dispatch_fn将调度器的队列最前面的元素取出，分派给request_queue中的请求队列以等候响应。
elevator_add_req_fn将一个新的request添加进调度器的队列。
elevator_set_req_fn和elevator_put_req_fn分别在创建新请求和将请求所占的空间释放到内存时调用。
elevator_init_fn用于初始化调度器实例。

2 elevator_init分配IO调度器elevator_queue

例如：
mmc_blk_probe()->mmc_blk_alloc()->mmc_blk_alloc_req()->mmc_init_queue()->blk_init_queue()->blk_init_queue_node()->blk_init_allocated_queue()->elevator_init(q, NULL)

int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	int err;

	if (unlikely(q->elevator))
		return 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(name, true);
		if (!e)
			return -EINVAL;
	}

	/*
	 * Use the default elevator specified by config boot param or
	 * config option.  Don't try to load modules as we could be running
	 * off async and request_module() isn't allowed from async.
	 */
	if (!e && *chosen_elevator) {
		e = elevator_get(chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e) {
		e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop", false);
		}
	}

	err = e->ops.elevator_init_fn(q, e);
	return 0;
}

(1) 如果name指定了elevator_type。

用e = elevator_get(name, true);来找一个elevator_type。为什么能找到呢？看一下elevator_get()。

static struct elevator_type *elevator_get(const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);

	return e;
}
static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (!strcmp(e->elevator_name, name))
			return e;
	}

	return NULL;
}

是遍历elv_list，找到e->elevator_name和name相同的elevator_type *e。这个elv_list上挂的就是elevator_type *e了，是什么时候挂的？是调度器注册的时候，比如：
deadline_init()->elv_register()->list_add_tail(&e->list, &elv_list)。

(2) name没有指定，用*chosen_elevator指定的elevator_type。

chosen_elevator存的也是elevator_type的name，是什么时候设置的呢？

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}
__setup("elevator=", elevator_setup);

include/linux/init.h中有__setup宏的定义，细节见。

__setup宏最终会定义一个obs_kernel_param类型的结构，并放在.init.setup段。

struct obs_kernel_param {
	const char *str;
	int (*setup_func)(char *);
	int early;
};

kernel在处理启动参数时，会与每个结构中的str对比，如果完全相同，就会调用该数据项的函数指针成员setup_func所指向的函数，并将启动参数后面的内容传给该处理函数作为参数。比如__setup("elevator=", elevator_setup);定义之后，.init.setup段就多了这样一个结构：
static const char __setup_str_elevator_setup[] __initconst
__aligned(1) = "elevator=";
static struct obs_kernel_param __setup_elevator_setup
__used __section(.init.setup)
__attribute__((aligned((sizeof(long)))))
= { __setup_str_elevator_setup, elevator_setup, 0}
start_kernel()->parse_args()->unknown_bootoption()->obsolete_checksetup()->(p->setup_func(line + n))
kernel处理启动参数时，将启动参数和__setup_start和__setup_end(就是.init.setup段的内容)之间的结构比较，
include/asm-generic/vmlinux.lds.h

include/asm-generic/vmlinux.lds.h
#define INIT_SETUP(initsetup_align)					\
		. = ALIGN(initsetup_align);				\
		VMLINUX_SYMBOL(__setup_start) = .;			\
		*(.init.setup)						\
		VMLINUX_SYMBOL(__setup_end) = .;

找到为”elevator=”的时候，就会执行elevator_setup(“deadline”)；“deadline”就是内核启动参数传过来的。
early_param宏和__setup宏差不多，只是early=1。

#define __setup(str, fn)					\
	__setup_param(str, fn, fn, 0)

/* NOTE: fn is as per module_param, not __setup!  Emits warning if fn
 * returns non-zero. */
#define early_param(str, fn)					\
	__setup_param(str, fn, fn, 1)

start_kernel()->parse_early_param()->parse_early_options()->parse_args()->do_early_param()->(p->setup_func(val))
do_early_param()中会判断p->early为1才会执行p->setup_func(val)。

(3) 前两个都没有指定，用CONFIG_DEFAULT_IOSCHED默认配置的elevator_type。
(4) 如果走到这里，那就只能用默认的noop调度器了。
(5) err = e->ops.elevator_init_fn(q, e)。

以deadline为例.elevator_init_fn = deadline_init_queue,

static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct deadline_data *dd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
	if (!dd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = dd;

	INIT_LIST_HEAD(&dd->fifo_list[READ]);
	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
	dd->sort_list[READ] = RB_ROOT;
	dd->sort_list[WRITE] = RB_ROOT;
	dd->fifo_expire[READ] = read_expire;
	dd->fifo_expire[WRITE] = write_expire;
	dd->writes_starved = writes_starved;
	dd->front_merges = 1;
	dd->fifo_batch = fifo_batch;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);
	return 0;
}

(a) elevator_alloc()分配和初始化一个调度器的实例elevator_queue，其成员type就是刚找到的elevator_type。
(b) 分配和初始化deadline调度器的私有数据deadline_data。
(c) request_queue(q)、elevator_queue(eq)、deadline_data(dd)之间的关联，
eq->elevator_data = dd;
q->elevator = eq;

显示全文

全部栏目