1 struct bio 表示一次块设备的IO请求。
2 struct request是bio(elevator)提交给IO调度器后产生的数据,一个request中存放着顺序排列的bio。当设备提交bio 给IO调度器时,新的bio可能被合并到request_queue中已有的request结构中(甚至合并到已有的bio中),或者生成新的request。request表示等待处理的块设备IO请求,结构里有sector等信息;通过queuelist把自己挂在它依附的request_queue上。
3 struct request_queue每个物理设备对应一个,是request所形成的list,其结构中有一些函数指针。
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
*/
int major; /* major number of driver */
int first_minor;
int minors; /* maximum number of minors, =1 for
* disks that can't be partitioned. */
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, umode_t *mode);
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
* helpers.
*/
struct disk_part_tbl __rcu *part_tbl;
struct hd_struct part0;
const struct block_device_operations *fops;
struct request_queue *queue;
void *private_data;
int flags;
struct device *driverfs_dev; // FIXME: remove
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;
};
major:块设备的主设备号。
first_minor:起始次设备号。
minors:描述了该块设备有多少个次设备号,或者说有多少个分区,如果minors为1,则表示该块设备没有分区。
part_tbl:整个块设备的分区信息都包含在里面,其核心结构是一个struct hd_struct的指针数组,每一项都指向一个描述分区的hd_struct结构。
fops:指向特定于设备的底层操作函数集。
queue:块设备的请求队列,所有针对该设备的请求都会放入该请求队列中,经过I/O scheduler的处理再进行提交。
struct hd_struct {
sector_t start_sect;
/*
* nr_sects is protected by sequence counter. One might extend a
* partition while IO is happening to it and update of nr_sects
* can be non-atomic on 32bit machines with 64bit sector_t.
*/
sector_t nr_sects;
seqcount_t nr_sects_seq;
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev;
struct kobject *holder_dir;
int policy, partno;
struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
int make_it_fail;
#endif
unsigned long stamp;
atomic_t in_flight[2];
#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats;
#else
struct disk_stats dkstats;
#endif
atomic_t ref;
struct rcu_head rcu_head;
};
struct disk_part_tbl {
struct rcu_head rcu_head;
int len;
struct hd_struct __rcu *last_lookup;
struct hd_struct __rcu *part[];
};
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key */
int bd_openers;
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
struct mutex bd_mutex; /* open/close mutex */
struct list_head bd_inodes;
void * bd_claiming;
void * bd_holder;
int bd_holders;
bool bd_write_holder;
#ifdef CONFIG_SYSFS
struct list_head bd_holder_disks;
#endif
struct block_device * bd_contains;
unsigned bd_block_size;
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
int bd_invalidated;
struct gendisk * bd_disk;
struct request_queue * bd_queue;
struct list_head bd_list;
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
* the same device multiple times, the owner must take special
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
/* The counter of freeze processes */
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
};
bd_dev:该设备(分区)的设备号。
bd_inode:指向该设备文件的inode。
bd_openers:一个引用计数,记录了该块设备打开的次数,或者说有多少个进程打开了该设备。
bd_contains:如果该block_device描述的是一个分区,则该变量指向描述主块设备的block_device,反之,其指向本身。
bd_part:如果该block_device描述的是一个分区,则该变量指向分区的信息。
bd_part_count:如果是分区,该变量记录了分区被打开的次数,在进行分区的重新扫描前,要保证该计数值为0。
bd_disk:指向描述整个设备的gendisk结构。
struct buffer_head {
unsigned long b_state; /* buffer state bitmap (see above) */
struct buffer_head *b_this_page;/* circular list of page's buffers */
struct page *b_page; /* the page this bh is mapped to */
sector_t b_blocknr; /* start block number */
size_t b_size; /* size of mapping */
char *b_data; /* pointer to data within the page */
struct block_device *b_bdev;
bh_end_io_t *b_end_io; /* I/O completion */
void *b_private; /* reserved for b_end_io */
struct list_head b_assoc_buffers; /* associated with another mapping */
struct address_space *b_assoc_map; /* mapping this buffer is
associated with */
atomic_t b_count; /* users using this buffer_head */
};
struct bio {
sector_t bi_sector; /* device address in 512 byte
sectors */
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
unsigned long bi_flags; /* status, command, etc */
unsigned long bi_rw; /* bottom bits READ/WRITE,
* top bits priority
*/
unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_idx; /* current index into bvl_vec */
/* Number of segments in this BIO after
* physical address coalescing is performed.
*/
unsigned int bi_phys_segments;
unsigned int bi_size; /* residual I/O count */
/*
* To keep track of the max segment size, we account for the
* sizes of the first and last mergeable segments in this bio.
*/
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
bio_end_io_t *bi_end_io;
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
/*
* Optional ioc and css associated with this bio. Put on bio
* release. Read comment on top of bio_associate_current().
*/
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
#endif
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
/*
* Everything starting with bi_max_vecs will be preserved by bio_reset()
*/
unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool;
/*
* We can inline a number of vecs at the end of the bio, to avoid
* double allocations for a small number of bio_vecs. This member
* MUST obviously be kept at the very end of the bio.
*/
struct bio_vec bi_inline_vecs[0];
};
bi_sector:该I/O操作的起始扇区号。
bi_rw:指明了读写方向。
bi_vcnt:该I/O操作中涉及到了多少个缓存向量,每个缓存向量由[page,offset,len]来描述。
bi_idx:指示当前的缓存向量。
bi_io_vec:缓存向量数组。
struct bio_vec {
struct page *bv_page;
unsigned int bv_len;
unsigned int bv_offset;
};
struct request {
struct list_head queuelist;
struct call_single_data csd;
struct request_queue *q;
unsigned int cmd_flags;
enum rq_cmd_type_bits cmd_type;
unsigned long atomic_flags;
int cpu;
/* the following two fields are internal, NEVER access directly */
unsigned int __data_len; /* total data len */
sector_t __sector; /* sector cursor */
struct bio *bio;
struct bio *biotail;
struct hlist_node hash; /* merge hash */
/*
* The rb_node is only used inside the io scheduler, requests
* are pruned when moved to the dispatch queue. So let the
* completion_data share space with the rb_node.
*/
union {
struct rb_node rb_node; /* sort/lookup */
void *completion_data;
};
/*
* Three pointers are available for the IO schedulers, if they need
* more they have to dynamically allocate it. Flush requests are
* never put on the IO scheduler. So let the flush fields share
* space with the elevator data.
*/
union {
struct {
struct io_cq *icq;
void *priv[2];
} elv;
struct {
unsigned int seq;
struct list_head list;
rq_end_io_fn *saved_end_io;
} flush;
};
struct gendisk *rq_disk;
struct hd_struct *part;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
unsigned long long start_time_ns;
unsigned long long io_start_time_ns; /* when passed to hardware */
#endif
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
*/
unsigned short nr_phys_segments;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
unsigned short nr_integrity_segments;
#endif
unsigned short ioprio;
int ref_count;
void *special; /* opaque pointer available for LLD use */
char *buffer; /* kaddr of the current segment if available */
int tag;
int errors;
/*
* when request is used as a packet command carrier
*/
unsigned char __cmd[BLK_MAX_CDB];
unsigned char *cmd;
unsigned short cmd_len;
unsigned int extra_len; /* length of alignment and padding */
unsigned int sense_len;
unsigned int resid_len; /* residual count */
void *sense;
unsigned long deadline;
struct list_head timeout_list;
unsigned int timeout;
int retries;
/*
* completion callback.
*/
rq_end_io_fn *end_io;
void *end_io_data;
/* for bidi */
struct request *next_rq;
};
queuelist:用于将request链入请求队列的链表元素。
q:指向所属的请求队列。
__sector:下一个要传输的bio的起始扇区号。
__data_len:request要传输的数据字节数。
bio,biotail:用于维护request中的bio链表。
struct request_queue {
/*
* Together with queue_head for cacheline sharing
*/
struct list_head queue_head;
struct request *last_merge;
struct elevator_queue *elevator;
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other
* blkgs from their own blkg->rl. Which one to use should be
* determined using bio_request_list().
*/
struct request_list root_rl;
request_fn_proc *request_fn;
make_request_fn *make_request_fn;
prep_rq_fn *prep_rq_fn;
unprep_rq_fn *unprep_rq_fn;
merge_bvec_fn *merge_bvec_fn;
softirq_done_fn *softirq_done_fn;
rq_timed_out_fn *rq_timed_out_fn;
dma_drain_needed_fn *dma_drain_needed;
lld_busy_fn *lld_busy_fn;
/*
* Dispatch queue sorting
*/
sector_t end_sector;
struct request *boundary_rq;
/*
* Delayed queue handling
*/
struct delayed_work delay_work;
struct backing_dev_info backing_dev_info;
/*
* The queue owner gets to use this for whatever they like.
* ll_rw_blk doesn't touch it.
*/
void *queuedata;
/*
* various queue flags, see QUEUE_* below
*/
unsigned long queue_flags;
/*
* ida allocated id for this queue. Used to index queues from
* ioctx.
*/
int id;
/*
* queue needs bounce pages for pages above this limit
*/
gfp_t bounce_gfp;
/*
* protects queue structures from reentrancy. ->__queue_lock should
* _never_ be used directly, it is queue private. always use
* ->queue_lock.
*/
spinlock_t __queue_lock;
spinlock_t *queue_lock;
/*
* queue kobject
*/
struct kobject kobj;
#ifdef CONFIG_PM_RUNTIME
struct device *dev;
int rpm_status;
unsigned int nr_pending;
#endif
/*
* queue settings
*/
unsigned long nr_requests; /* Max # of requests */
unsigned int nr_congestion_on;
unsigned int nr_congestion_off;
unsigned int nr_batching;
unsigned int dma_drain_size;
void *dma_drain_buffer;
unsigned int dma_pad_mask;
unsigned int dma_alignment;
struct blk_queue_tag *queue_tags;
struct list_head tag_busy_list;
unsigned int nr_sorted;
unsigned int in_flight[2];
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
* queue_lock internally, e.g. scsi_request_fn().
*/
unsigned int request_fn_active;
unsigned int rq_timeout;
struct timer_list timeout;
struct list_head timeout_list;
struct list_head icq_list;
#ifdef CONFIG_BLK_CGROUP
DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
struct blkcg_gq *root_blkg;
struct list_head blkg_list;
#endif
struct queue_limits limits;
/*
* sg stuff
*/
unsigned int sg_timeout;
unsigned int sg_reserved_size;
int node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
struct blk_trace *blk_trace;
#endif
/*
* for flush operations
*/
unsigned int flush_flags;
unsigned int flush_not_queueable:1;
unsigned int flush_queue_delayed:1;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
struct request flush_rq;
struct mutex sysfs_lock;
int bypass_depth;
#if defined(CONFIG_BLK_DEV_BSG)
bsg_job_fn *bsg_job_fn;
int bsg_job_size;
struct bsg_class_device bsg_dev;
#endif
#ifdef CONFIG_BLK_CGROUP
struct list_head all_q_node;
#endif
#ifdef CONFIG_BLK_DEV_THROTTLING
/* Throttle data */
struct throtl_data *td;
#endif
struct rcu_head rcu_head;
};
int submit_bh(int rw, struct buffer_head *bh)
{
return _submit_bh(rw, bh, 0);
}
int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
{
struct bio *bio;
int ret = 0;
BUG_ON(!buffer_locked(bh));
BUG_ON(!buffer_mapped(bh));
BUG_ON(!bh->b_end_io);
BUG_ON(buffer_delay(bh));
BUG_ON(buffer_unwritten(bh));
/*
* Only clear out a write error when rewriting
*/
if (test_set_buffer_req(bh) && (rw & WRITE))
clear_buffer_write_io_error(bh);
/*
* from here on down, it's all bio -- do the initial mapping,
* submit_bio -> generic_make_request may further map this bio around
*/
bio = bio_alloc(GFP_NOIO, 1);
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_io_vec[0].bv_page = bh->b_page;
bio->bi_io_vec[0].bv_len = bh->b_size;
bio->bi_io_vec[0].bv_offset = bh_offset(bh);
bio->bi_vcnt = 1;
bio->bi_size = bh->b_size;
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
guard_bh_eod(rw, bio, bh);
if (buffer_meta(bh))
rw |= REQ_META;
if (buffer_prio(bh))
rw |= REQ_PRIO;
bio_get(bio);
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
bio_put(bio);
return ret;
}
void submit_bio(int rw, struct bio *bio)
{
bio->bi_rw |= rw;
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
unsigned int count;
if (unlikely(rw & REQ_WRITE_SAME))
count = bdev_logical_block_size(bio->bi_bdev) >> 9;
else
count = bio_sectors(bio);
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_size);
count_vm_events(PGPGIN, count);
}
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
(rw & WRITE) ? "WRITE" : "READ",
(unsigned long long)bio->bi_sector,
bdevname(bio->bi_bdev, b),
count);
}
}
generic_make_request(bio);
}
3 generic_make_request
void generic_make_request(struct bio *bio)
{
struct bio_list bio_list_on_stack;
if (!generic_make_request_checks(bio))
return;
/*
* We only want one ->make_request_fn to be active at a time, else
* stack usage with stacked devices could be a problem. So use
* current->bio_list to keep a list of requests submited by a
* make_request_fn function. current->bio_list is also used as a
* flag to say if generic_make_request is currently active in this
* task or not. If it is NULL, then no make_request is active. If
* it is non-NULL, then a make_request is active, and new requests
* should be added at the tail
*/
if (current->bio_list) {
bio_list_add(current->bio_list, bio);
return;
}
/* following loop may be a bit non-obvious, and so deserves some
* explanation.
* Before entering the loop, bio->bi_next is NULL (as all callers
* ensure that) so we have a list with a single bio.
* We pretend that we have just taken it off a longer list, so
* we assign bio_list to a pointer to the bio_list_on_stack,
* thus initialising the bio_list of new bios to be
* added. ->make_request() may indeed add some more bios
* through a recursive call to generic_make_request. If it
* did, we find a non-NULL value in bio_list and re-enter the loop
* from the top. In this case we really did just take the bio
* of the top of the list (no pretending) and so remove it from
* bio_list, and call into ->make_request() again.
*/
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack);
current->bio_list = &bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
q->make_request_fn(q, bio);
bio = bio_list_pop(current->bio_list);
} while (bio);
current->bio_list = NULL; /* deactivate */
}
void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
const bool sync = !!(bio->bi_rw & REQ_SYNC);
struct blk_plug *plug;
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
* ISA dma in theory)
*/
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio_endio(bio, -EIO);
return;
}
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
spin_lock_irq(q->queue_lock);
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
/*
* Check if we can merge with the plugged list before grabbing
* any locks.
*/
if (attempt_plug_merge(q, bio, &request_count))
return;
spin_lock_irq(q->queue_lock);
el_ret = elv_merge(q, &req, bio);
if (el_ret == ELEVATOR_BACK_MERGE) {
if (bio_attempt_back_merge(q, req, bio)) {
elv_bio_merged(q, req, bio);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
} else if (el_ret == ELEVATOR_FRONT_MERGE) {
if (bio_attempt_front_merge(q, req, bio)) {
elv_bio_merged(q, req, bio);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
}
get_rq:
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
* rq allocator and io schedulers.
*/
rw_flags = bio_data_dir(bio);
if (sync)
rw_flags |= REQ_SYNC;
/*
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (unlikely(!req)) {
bio_endio(bio, -ENODEV); /* @q is dead */
goto out_unlock;
}
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
* We don't worry about that case for efficiency. It won't happen
* often, and the elevators are able to handle it.
*/
init_request_from_bio(req, bio);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
req->cpu = raw_smp_processor_id();
plug = current->plug;
if (plug) {
/*
* If this is the first request added after a plug, fire
* of a plug trace. If others have been added before, check
* if we have multiple devices in this plug. If so, make a
* note to sort the list before dispatch.
*/
if (list_empty(&plug->list))
trace_block_plug(q);
else {
if (request_count >= BLK_MAX_REQUEST_COUNT) {
blk_flush_plug_list(plug, false);
trace_block_plug(q);
}
}
list_add_tail(&req->queuelist, &plug->list);
drive_stat_acct(req, 1);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where);
__blk_run_queue(q);
out_unlock:
spin_unlock_irq(q->queue_lock);
}
}
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
}
例如mq->queue = blk_init_queue(mmc_request_fn, lock);
int register_blkdev(unsigned int major, const char *name)
{
struct blk_major_name **n, *p;
int index, ret = 0;
mutex_lock(&block_class_lock);
/* temporary */
if (major == 0) {
for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
if (major_names[index] == NULL)
break;
}
if (index == 0) {
printk("register_blkdev: failed to get major for %s\n",
name);
ret = -EBUSY;
goto out;
}
major = index;
ret = major;
}
p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
if (p == NULL) {
ret = -ENOMEM;
goto out;
}
p->major = major;
strlcpy(p->name, name, sizeof(p->name));
p->next = NULL;
index = major_to_index(major);
for (n = &major_names[index]; *n; n = &(*n)->next) {
if ((*n)->major == major)
break;
}
if (!*n)
*n = p;
else
ret = -EBUSY;
if (ret < 0) {
printk("register_blkdev: cannot get major %d for %s\n",
major, name);
kfree(p);
}
out:
mutex_unlock(&block_class_lock);
return ret;
}
例如:res = register_blkdev(MMC_BLOCK_MAJOR, "mmc");
struct blk_major_name的hash表major_names。
static struct blk_major_name {
struct blk_major_name *next;
int major;
char name[16];
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
struct gendisk alloc_disk(int minors)
void add_disk(struct gendisk *disk)
{
struct backing_dev_info *bdi;
dev_t devt;
int retval;
/* minors == 0 indicates to use ext devt from part0 and should
* be accompanied with EXT_DEVT flag. Make sure all
* parameters make sense.
*/
WARN_ON(disk->minors && !(disk->major || disk->first_minor));
WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
disk->flags |= GENHD_FL_UP;
retval = blk_alloc_devt(&disk->part0, &devt);
if (retval) {
WARN_ON(1);
return;
}
disk_to_dev(disk)->devt = devt;
/* ->major and ->first_minor aren't supposed to be
* dereferenced from here on, but set them just in case.
*/
disk->major = MAJOR(devt);
disk->first_minor = MINOR(devt);
disk_alloc_events(disk);
/* Register BDI before referencing it from bdev */
bdi = &disk->queue->backing_dev_info;
bdi_register_dev(bdi, disk_devt(disk));
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
register_disk(disk);
blk_register_queue(disk);
/*
* Take an extra ref on queue which will be put on disk_release()
* so that it sticks around as long as @disk is there.
*/
WARN_ON_ONCE(!blk_get_queue(disk->queue));
retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
"bdi");
WARN_ON(retval);
disk_add_events(disk);
}
struct elevator_type
{
/* managed by elevator core */
struct kmem_cache *icq_cache;
/* fields provided by elevator implementation */
struct elevator_ops ops;
size_t icq_size; /* see iocontext.h */
size_t icq_align; /* ditto */
struct elv_fs_entry *elevator_attrs;
char elevator_name[ELV_NAME_MAX];
struct module *elevator_owner;
/* managed by elevator core */
char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */
struct list_head list;
};
struct elevator_queue
{
struct elevator_type *type;
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
unsigned int registered:1;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};
elevator_type对应一个调度器类型,elevator_queue对应一个调度器实例,如果内核中只有上述四种类型的调度器,则只有四个elevator_type;但是多个块设备(分区)可拥有多个相应调度器的实例,也就是elevator_queue。两个数据结构中最关键的元素都是struct elevator_ops,该结构定义了一组操作函数,用来描述请求队列的相关算法,实现对请求的处理。
struct elevator_ops
{
elevator_merge_fn *elevator_merge_fn;
elevator_merged_fn *elevator_merged_fn;
elevator_merge_req_fn *elevator_merge_req_fn;
elevator_allow_merge_fn *elevator_allow_merge_fn;
elevator_bio_merged_fn *elevator_bio_merged_fn;
elevator_dispatch_fn *elevator_dispatch_fn;
elevator_add_req_fn *elevator_add_req_fn;
elevator_activate_req_fn *elevator_activate_req_fn;
elevator_deactivate_req_fn *elevator_deactivate_req_fn;
elevator_completed_req_fn *elevator_completed_req_fn;
elevator_request_list_fn *elevator_former_req_fn;
elevator_request_list_fn *elevator_latter_req_fn;
elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
elevator_set_req_fn *elevator_set_req_fn;
elevator_put_req_fn *elevator_put_req_fn;
elevator_may_queue_fn *elevator_may_queue_fn;
elevator_init_fn *elevator_init_fn;
elevator_exit_fn *elevator_exit_fn;
};
elevator_merge_fn查询一个request,用于将bio并入。
int elevator_init(struct request_queue *q, char *name)
{
struct elevator_type *e = NULL;
int err;
if (unlikely(q->elevator))
return 0;
INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
q->end_sector = 0;
q->boundary_rq = NULL;
if (name) {
e = elevator_get(name, true);
if (!e)
return -EINVAL;
}
/*
* Use the default elevator specified by config boot param or
* config option. Don't try to load modules as we could be running
* off async and request_module() isn't allowed from async.
*/
if (!e && *chosen_elevator) {
e = elevator_get(chosen_elevator, false);
if (!e)
printk(KERN_ERR "I/O scheduler %s not found\n",
chosen_elevator);
}
if (!e) {
e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
if (!e) {
printk(KERN_ERR
"Default I/O scheduler not found. " \
"Using noop.\n");
e = elevator_get("noop", false);
}
}
err = e->ops.elevator_init_fn(q, e);
return 0;
}
用e = elevator_get(name, true);来找一个elevator_type。为什么能找到呢?看一下elevator_get()。
static struct elevator_type *elevator_get(const char *name, bool try_loading)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
e = elevator_find(name);
if (!e && try_loading) {
spin_unlock(&elv_list_lock);
request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
e = elevator_find(name);
}
if (e && !try_module_get(e->elevator_owner))
e = NULL;
spin_unlock(&elv_list_lock);
return e;
}
static struct elevator_type *elevator_find(const char *name)
{
struct elevator_type *e;
list_for_each_entry(e, &elv_list, list) {
if (!strcmp(e->elevator_name, name))
return e;
}
return NULL;
}
是遍历elv_list,找到e->elevator_name和name相同的elevator_type *e。这个elv_list上挂的就是elevator_type *e了,是什么时候挂的?是调度器注册的时候,比如:
static int __init elevator_setup(char *str)
{
/*
* Be backwards-compatible with previous kernels, so users
* won't get the wrong elevator.
*/
strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
return 1;
}
__setup("elevator=", elevator_setup);
include/linux/init.h中有__setup宏的定义,细节见
。
__setup宏最终会定义一个obs_kernel_param类型的结构,并放在.init.setup段。
struct obs_kernel_param {
const char *str;
int (*setup_func)(char *);
int early;
};
kernel在处理启动参数时,会与每个结构中的str对比,如果完全相同,就会调用该数据项的函数指针成员setup_func所指向的函数,并将启动参数后面的内容传给该处理函数作为参数。比如__setup("elevator=", elevator_setup);定义之后,.init.setup段就多了这样一个结构:
include/asm-generic/vmlinux.lds.h
#define INIT_SETUP(initsetup_align) \
. = ALIGN(initsetup_align); \
VMLINUX_SYMBOL(__setup_start) = .; \
*(.init.setup) \
VMLINUX_SYMBOL(__setup_end) = .;
找到为”elevator=”的时候,就会执行elevator_setup(“deadline”);“deadline”就是内核启动参数传过来的。
#define __setup(str, fn) \
__setup_param(str, fn, fn, 0)
/* NOTE: fn is as per module_param, not __setup! Emits warning if fn
* returns non-zero. */
#define early_param(str, fn) \
__setup_param(str, fn, fn, 1)
start_kernel()->parse_early_param()->parse_early_options()->parse_args()->do_early_param()->(p->setup_func(val))
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!dd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = dd;
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
dd->sort_list[READ] = RB_ROOT;
dd->sort_list[WRITE] = RB_ROOT;
dd->fifo_expire[READ] = read_expire;
dd->fifo_expire[WRITE] = write_expire;
dd->writes_starved = writes_starved;
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}
(a) elevator_alloc()分配和初始化一个调度器的实例elevator_queue,其成员type就是刚找到的elevator_type。