Linux I/O Path (2007-01-16)
TRANSCRIPT
1
generic_file_aio_read()
generic_file_direct_IO()
do_generic_file_read()
do_generic_mapping_read()
vfs_read()
sys_read()
do_sync_read()
file->f_op->read()
mm/filemap.c
fs/read_write.c
include/linux/fs.h
fs/ext2/file.c
filp->f_op->aio_read()
Page Cache
Generic Block Layer
Device Driver
Request Queue
Elevator
I/O Scheduler
fget_light()
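This call chain is driven from user space by an ordinary read(2) on a file opened without O_DIRECT. A minimal, self-contained example (the file name is only an illustrative choice) that exercises this buffered-read path:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd = open("/etc/hostname", O_RDONLY);       /* illustrative test file */

        if (fd < 0)
                return 1;
        /* enters sys_read() -> vfs_read() -> do_sync_read() ->
           generic_file_aio_read(), served through the page cache */
        n = read(fd, buf, sizeof(buf));
        if (n > 0)
                printf("read %zd bytes\n", n);
        close(fd);
        return 0;
}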
2
mm/filemap.c
find page
page ok
page not up to date
readpage
no cached page
readpage error
out
continue
do_generic_mapping_read()
page cache
readahead
3
submit_bio()
mpage_bio_submit()
do_mpage_readpage()
do_generic_mapping_read()
mm/filemap.c
mm/readahead.c
fs/mpage.c
fs/ext2/inode.c
ext2_readpages()
mpage_readpages()
page_cache_readahead()
blockable_page_cache_readahead()
__do_page_cache_readahead()
read_pages()
mapping->a_ops->readpages()
ext2_readpage()
mpage_readpage()
mapping->a_ops->readpage()
ext2_get_block()
Generic Block Layer
Device Driver
Request Queue
Elevator
I/O Scheduler
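The hand-off from the generic page-cache code into ext2 goes through the file's address_space_operations. An abridged sketch of how fs/ext2/inode.c wires ext2_readpage()/ext2_readpages() into mapping->a_ops, from memory of the 2.6-era sources (other callbacks omitted, details may differ):

static int ext2_readpage(struct file *file, struct page *page)
{
        /* map file blocks with ext2_get_block() and submit the BIO(s) */
        return mpage_readpage(page, ext2_get_block);
}

static int
ext2_readpages(struct file *file, struct address_space *mapping,
                struct list_head *pages, unsigned nr_pages)
{
        return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
}

const struct address_space_operations ext2_aops = {
        .readpage       = ext2_readpage,
        .readpages      = ext2_readpages,
        /* ... writepage, prepare_write, etc. omitted ... */
};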
4
__make_request(q,bio)
elv_merge(q,req,bio)
get_request_wait()
init_request_from_bio()
add_request()
generic_make_request(bio)
q->make_request_fn(q,bio)
submit_bio()
__elv_add_request()
elv_insert()
include/linux/blkdev.h
mm/mpage.c
block/ll_rw_blk.c
block/elevator.c
block/as-iosched.c
e->ops->elevator_add_req_fn(q, rq)
elv_merged_request()
ll_merge_requests_fn()
e->ops->elevator_merged_fn()
q->back_merge_fn()
Device Driver
Request Queue
I/O Scheduler
elv_may_queue()
get_request()
current_io_context()
blk_alloc_request()
BLK_TA_INSERT
BLK_TA_QUEUE
BLK_TA_MERGE
BLK_TA_SLEEPRQ
BLK_TA_GETRQ
5
ide_init_queue()
blk_init_queue_node(do_ide_request, ...)
q->request_fn = rfn
do_ide_request()
ide_do_request()
hwif_init()
ideprobe_init()
ide_generic_init()
init_irq()
request_irq(hwif->irq,&ide_intr, ...)
ide_intr()
register IRQ handler
register I/O request dispatcher
drivers/ide/ide-probe.c
drivers/ide/ide-generic.c
block/ll_rw_blk.c
drivers/ide/ide-io.c
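For comparison with ide_init_queue() passing do_ide_request to blk_init_queue_node(), here is a hedged sketch of how a simple block driver typically registers its request handler; my_request_fn, my_lock, my_queue and my_driver_init are illustrative names, not taken from the IDE driver:

static DEFINE_SPINLOCK(my_lock);                /* illustrative */
static struct request_queue *my_queue;

static void my_request_fn(request_queue_t *q)
{
        struct request *rq;

        /* called with q->queue_lock held; drain the dispatch queue */
        while ((rq = elv_next_request(q)) != NULL) {
                /* a real driver programs the hardware here and usually
                   completes the request from its interrupt handler */
                end_request(rq, 1);
        }
}

static int __init my_driver_init(void)
{
        my_queue = blk_init_queue(my_request_fn, &my_lock);
        return my_queue ? 0 : -ENOMEM;
}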
6
Disk
elv_next_request()
ide_do_request()
__elv_add_request()
elv_insert()
q->request_fn()
add_request()
rq = __elv_next_request(q)
list_add_tail()
interrupt for completion
include/linux/blkdev.h
block/ll_rw_blk.c
block/elevator.c
block/as-iosched.c
q->make_request_fn()
Disk request queue without an I/O scheduler
start_request()
BLK_TA_INSERT
BLK_TA_PLUG
BLK_TA_ISSUE(D)
7
Disk
elv_next_request()
ide_do_request()
start_request()
q->make_request_fn()
__elv_add_request()
elv_insert()
q->request_fn()
add_request()
interrupt for completion
rq = __elv_next_request(q)
e->ops->elevator_add_req_fn(q, rq)
e->ops->elevator_dispatch_fn()
include/linux/blkdev.h
block/ll_rw_blk.c
block/elevator.c
block/as-iosched.c
Disk request queue with an I/O scheduler
BLK_TA_ISSUE(D)
BLK_TA_INSERT
BLK_TA_PLUG
8
ide_do_request()
rq = elv_next_request(drive->queue)
start_request(drive, rq)
rq = __elv_next_request(q)
q->elevator->ops->elevator_dispatch_fn(q, 0)
ISSUE
block/elevator.c
drivers/ide/ide-io.c
block/ll_rw_blk.c
block/as-iosched.c
Disk
static inline struct request *__elv_next_request(request_queue_t *q)
{
        struct request *rq;

        while (1) {
                while (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
                        if (blk_do_ordered(q, &rq))
                                return rq;
                }

                if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
                        return NULL;
        }
}
9
static int noop_dispatch(request_queue_t *q, int force)
{
        struct noop_data *nd = q->elevator->elevator_data;

        if (!list_empty(&nd->queue)) {
                struct request *rq;
                rq = list_entry(nd->queue.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
                elv_dispatch_sort(q, rq);
                return 1;
        }
        return 0;
}

static void noop_add_request(request_queue_t *q, struct request *rq)
{
        struct noop_data *nd = q->elevator->elevator_data;

        list_add_tail(&rq->queuelist, &nd->queue);
}

static void *noop_init_queue(request_queue_t *q, elevator_t *e)
{
        struct noop_data *nd;

        nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
        if (!nd)
                return NULL;
        INIT_LIST_HEAD(&nd->queue);
        return nd;
}
Disk
Elevator private data structure
block/noop-iosched.c
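In the same file, these functions are plugged into the elevator core through a struct elevator_type that is passed to elv_register() at module init. An abridged sketch of that registration, from memory of the 2.6-era block/noop-iosched.c (several callbacks omitted):

static struct elevator_type elevator_noop = {
        .ops = {
                .elevator_dispatch_fn   = noop_dispatch,
                .elevator_add_req_fn    = noop_add_request,
                .elevator_init_fn       = noop_init_queue,
                /* merge / former / latter / exit callbacks omitted */
        },
        .elevator_name  = "noop",
        .elevator_owner = THIS_MODULE,
};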
10
struct deadline_data {
        /* requests (deadline_rq s) are present on both sort_list and fifo_list */
        struct rb_root sort_list[2];
        struct list_head fifo_list[2];

        /* next in sort order. read, write or both are NULL */
        struct request *next_rq[2];
        unsigned int batching;          /* number of sequential requests made */
        sector_t last_sector;           /* head position */
        unsigned int starved;           /* times reads have starved writes */

        /* settings that change how the i/o scheduler behaves */
        int fifo_expire[2];
        int fifo_batch;
        int writes_starved;
        int front_merges;
};
[Figure: runtime data of the deadline I/O scheduler. Read requests 4, 5 and 6 are held both in sort_list[READ] (an rb-tree ordered by sector) and in fifo_list[READ] (arrival order); write requests 7, 8 and 9 likewise in sort_list[WRITE] and fifo_list[WRITE]. next_rq[READ] and next_rq[WRITE] cache the next request in sort order for each direction.]
The runtime data of the deadline I/O scheduler
block/deadline-iosched.c
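The tunables at the bottom of the struct are initialized from file-scope defaults and exported through sysfs. A hedged sketch of those defaults, from memory of the 2.6-era block/deadline-iosched.c (exact values may differ by version):

static const int read_expire = HZ / 2;  /* max time before a read is submitted */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2;    /* max times reads can starve a write */
static const int fifo_batch = 16;       /* # of sequential requests treated as one */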
11
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        const int data_dir = rq_data_dir(rq);

        deadline_add_rq_rb(dd, rq);

        /*
         * set expire time (only used for reads) and add to fifo list
         */
        rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
        list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}
[Figure: the same sort_list/fifo_list diagram as on the previous slide; the new request is linked into both the rb-tree (sector order) and the FIFO list (arrival order) for its direction.]
Add a request to both the rb-tree and the FIFO list
block/deadline-iosched.c
12
[Figure: sort_list[READ]/fifo_list[READ] holding requests 4, 5, 6 and sort_list[WRITE]/fifo_list[WRITE] holding 7, 8, 9, with next_rq[READ] and next_rq[WRITE] marked.]
if (dd->next_rq[WRITE])
        rq = dd->next_rq[WRITE];
else
        rq = dd->next_rq[READ];

if (rq) {
        /* we have a "next request" */
        if (dd->last_sector != rq->sector)
                /* end the batch on a non sequential request */
                dd->batching += dd->fifo_batch;

        if (dd->batching < dd->fifo_batch)
                /* we are still entitled to batch */
                goto dispatch_request;
}
1. Check whether we are running a sequential batch and whether it is still entitled to continue.
block/deadline-iosched.c
13
[Figure: the sort_list/fifo_list diagram for both the read and write directions.]
2. If we are not running a batch, choose a new direction in which to serve requests.
Reads are always favored, unless writes have been starved for too long.
if (reads) {
        if (writes && (dd->starved++ >= dd->writes_starved))
                goto dispatch_writes;

        data_dir = READ;
        goto dispatch_find_request;
}

if (writes) {
dispatch_writes:
        dd->starved = 0;

        data_dir = WRITE;
        goto dispatch_find_request;
}
block/deadline-iosched.c
14
3. Choose an appropriate request:
if the first request on the FIFO list has expired, serve it;
otherwise behave as a one-way elevator.
dispatch_find_request:
        if (deadline_check_fifo(dd, data_dir)) {
                dd->batching = 0;
                rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
        } else if (dd->next_rq[data_dir]) {
                rq = dd->next_rq[data_dir];
        } else {
                struct rb_node *node;

                dd->batching = 0;
                node = rb_first(&dd->sort_list[data_dir]);
                if (node)
                        rq = rb_entry_rq(node);
        }
[Figure: sort_list[READ] and fifo_list[READ] with requests 4, 5 and 6; next_rq[READ] marks where the one-way elevator scan would continue.]
block/deadline-iosched.c
15
Dispatch the request, remove it from the elevator’s private queue
and put it in the dispatch queue.
Also update the information about the “last” and the “next” request.
static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
        const int data_dir = rq_data_dir(rq);
        struct rb_node *rbnext = rb_next(&rq->rb_node);

        dd->next_rq[READ] = NULL;
        dd->next_rq[WRITE] = NULL;

        if (rbnext)
                dd->next_rq[data_dir] = rb_entry_rq(rbnext);

        dd->last_sector = rq->sector + rq->nr_sectors;

        deadline_move_to_dispatch(dd, rq);
}
Disk
block/deadline-iosched.c
Suppose request 4 was picked in the previous step.
[Figure: request 4 has been moved off to the dispatch queue; requests 5 and 6 remain on sort_list[READ]/fifo_list[READ], and next_rq[READ] now points at the next request in sort order.]
16
static void as_add_request(request_queue_t *q, struct request *rq)
{
        struct as_data *ad = q->elevator->elevator_data;
        int data_dir;

        RQ_SET_STATE(rq, AS_RQ_NEW);

        data_dir = rq_is_sync(rq);

        rq->elevator_private = as_get_io_context(q->node);

        if (RQ_IOC(rq)) {
                as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
                atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
        }

        as_add_rq_rb(ad, rq);

        /*
         * set expire time (only used for reads) and add to fifo list
         */
        rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
        list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);

        as_update_rq(ad, rq);   /* keep state machine up to date */
        RQ_SET_STATE(rq, AS_RQ_QUEUED);
}
block/as-iosched.c
17
as_add_request()
as_get_io_context(q->node)
as_add_rq_rb()
rq_set_fifo_time()
list_add_tail()
get_io_context()
alloc_as_io_context()
as_update_thinktime()
as_update_seekdist()
as_update_iohist()
current_io_context()
[Figure: each task_struct points to an io_context whose as_io_context tracks that task's queued requests.]
include/linux/list.h
block/ll_rw_blk.c
include/linux/elevator.h
block/as-iosched.c
as_update_rq(ad, rq)
as_choose_req()
as_can_break_anticipation()
as_antic_stop()
del_timer()
kblockd_schedule_work()
as_update_iohist()
18
(repeat of the as_add_request() call diagram from slide 17)
19
static void
as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq)
{
        ...
        if (data_dir == REQ_SYNC) {
                unsigned long in_flight = atomic_read(&aic->nr_queued)
                                        + atomic_read(&aic->nr_dispatched);
                spin_lock(&aic->lock);
                if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
                    test_bit(AS_TASK_IOSTARTED, &aic->state)) {
                        /* Calculate read -> read thinktime */
                        if (test_bit(AS_TASK_IORUNNING, &aic->state)
                            && in_flight == 0) {
                                thinktime = jiffies - aic->last_end_request;
                                thinktime = min(thinktime, MAX_THINKTIME-1);
                        }
                        as_update_thinktime(ad, aic, thinktime);

                        /* Calculate read -> read seek distance */
                        if (aic->last_request_pos < rq->sector)
                                seek_dist = rq->sector - aic->last_request_pos;
                        else
                                seek_dist = aic->last_request_pos - rq->sector;
                        as_update_seekdist(ad, aic, seek_dist);
                }
                aic->last_request_pos = rq->sector + rq->nr_sectors;
                set_bit(AS_TASK_IOSTARTED, &aic->state);
                spin_unlock(&aic->lock);
        }
}
20
(repeat of the as_add_request() call diagram from slide 17)
21
as_move_to_dispatch()
as_antic_waitreq()
as_antic_waitnext()
as_antic_timeout()
as_add_rq_rb()
as_update_rq()
as_antic_stop()
as_add_request() as_dispatch_request() as_completed_request()
ANTIC_OFF ANTIC_FINISHED ANTIC_WAIT_REQ ANTIC_WAIT_NEXT
kblockd_schedule_work()
enum anticipation_status {
        ANTIC_OFF = 0,          /* Not anticipating (normal operation) */
        ANTIC_WAIT_REQ,         /* The last read has not yet completed */
        ANTIC_WAIT_NEXT,        /* Currently anticipating a request vs
                                   last read (which has completed) */
        ANTIC_FINISHED,         /* Anticipating but have found a candidate
                                   or timed out */
};
22
/*
 * This is called directly by the functions in this file to stop anticipation.
 * We kill the timer and schedule a call to the request_fn asap.
 */
static void as_antic_stop(struct as_data *ad)
{
        int status = ad->antic_status;

        if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
                if (status == ANTIC_WAIT_NEXT)
                        del_timer(&ad->antic_timer);
                ad->antic_status = ANTIC_FINISHED;
                /* see as_work_handler */
                kblockd_schedule_work(&ad->antic_work);
        }
}
23
/*
 * as_update_rq must be called whenever a request (rq) is added to
 * the sort_list. This function keeps caches up to date, and checks if the
 * request might be one we are "anticipating"
 */
static void as_update_rq(struct as_data *ad, struct request *rq)
{
        const int data_dir = rq_is_sync(rq);

        /* keep the next_rq cache up to date */
        ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);

        /*
         * have we been anticipating this request?
         * or does it come from the same process as the one we are anticipating
         * for?
         */
        if (ad->antic_status == ANTIC_WAIT_REQ
                        || ad->antic_status == ANTIC_WAIT_NEXT) {
                if (as_can_break_anticipation(ad, rq))
                        as_antic_stop(ad);
        }
}
24
/*
 * This is executed in a "deferred" process context, by kblockd. It calls the
 * driver's request_fn so the driver can submit that request.
 *
 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
 * state before calling, and don't rely on any state over calls.
 *
 * FIXME! dispatch queue is not a queue at all!
 */
static void as_work_handler(void *data)
{
        struct request_queue *q = data;
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        blk_start_queueing(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
}
25
/*
 * as_antic_timeout is the timer function set by as_antic_waitnext.
 */
static void as_antic_timeout(unsigned long data)
{
        struct request_queue *q = (struct request_queue *)data;
        struct as_data *ad = q->elevator->elevator_data;
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        if (ad->antic_status == ANTIC_WAIT_REQ
                        || ad->antic_status == ANTIC_WAIT_NEXT) {
                struct as_io_context *aic = ad->io_context->aic;

                ad->antic_status = ANTIC_FINISHED;
                kblockd_schedule_work(&ad->antic_work);

                if (aic->ttime_samples == 0) {
                        /* process anticipated on has exited or timed out */
                        ad->exit_prob = (7*ad->exit_prob + 256)/8;
                }
                if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
                        /* process not "saved" by a cooperating request */
                        ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
                }
        }
        spin_unlock_irqrestore(q->queue_lock, flags);
}
26
static void as_put_io_context(struct request *rq)
{
        struct as_io_context *aic;

        if (unlikely(!RQ_IOC(rq)))
                return;

        aic = RQ_IOC(rq)->aic;

        if (rq_is_sync(rq) && aic) {
                spin_lock(&aic->lock);
                set_bit(AS_TASK_IORUNNING, &aic->state);
                aic->last_end_request = jiffies;
                spin_unlock(&aic->lock);
        }

        put_io_context(RQ_IOC(rq));
}
27
as_get_io_context()
get_io_context()
alloc_as_io_context()
as_put_io_context()
as_choose_req()
as_find_next_rq()
as_antic_expired()
as_antic_waitnext()
as_dispatch_request()
as_move_to_dispatch()
as_batch_expired()
as_fifo_expired()
as_remove_queued_request()
as_completed_request()
update_write_batch()
as_update_rq()
as_can_anticipate()
as_can_break_anticipation()
as_close_req()
as_update_iohist()
as_update_seekdist()
as_update_thinktime()
as_antic_waitreq()
as_can_break_anticipation()
as_antic_stop()
kblockd_schedule_work()
as_antic_stop()
copy_io_context()
put_io_context()
elv_dispatch_sort()
as_update_iohist()
28
create_workqueue("kblockd")
blk_dev_init()
genhd_device_init()
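The kblockd workqueue that kblockd_schedule_work() targets is created when the block layer initializes. An abridged sketch of that setup in block/ll_rw_blk.c, from memory of the 2.6-era sources (slab-cache and softirq setup trimmed):

static struct workqueue_struct *kblockd_workqueue;

int kblockd_schedule_work(struct work_struct *work)
{
        return queue_work(kblockd_workqueue, work);
}

int __init blk_dev_init(void)
{
        kblockd_workqueue = create_workqueue("kblockd");
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");
        /* ... request slab caches, per-CPU completion lists ... */
        return 0;
}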
29
void blk_start_queueing(request_queue_t *q)
{
        if (!blk_queue_plugged(q))
                q->request_fn(q);
        else
                __generic_unplug_device(q);
}

void __generic_unplug_device(request_queue_t *q)
{
        if (unlikely(blk_queue_stopped(q)))
                return;

        if (!blk_remove_plug(q))
                return;

        q->request_fn(q);
}

int blk_remove_plug(request_queue_t *q)
{
        WARN_ON(!irqs_disabled());

        if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
                return 0;

        del_timer(&q->unplug_timer);
        return 1;
}
blk_start_queueing()
blk_queue_plugged()
__generic_unplug_device()
del_timer()
q->request_fn()
not plugged plugged
blk_remove_plug()
30
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
        q->nr_requests = BLKDEV_MAX_RQ;
        blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
        q->make_request_fn = mfn;
        q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
        q->backing_dev_info.state = 0;
        q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
        blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
        blk_queue_hardsect_size(q, 512);
        blk_queue_dma_alignment(q, 511);
        blk_queue_congestion_threshold(q);
        q->nr_batching = BLK_BATCH_REQ;

        q->unplug_thresh = 4;                   /* hmm */
        q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
        if (q->unplug_delay == 0)
                q->unplug_delay = 1;

        INIT_WORK(&q->unplug_work, blk_unplug_work, q);

        q->unplug_timer.function = blk_unplug_timeout;
        q->unplug_timer.data = (unsigned long)q;

        blk_queue_activity_fn(q, NULL, NULL);
}
block/ll_rw_blk.c
31
request_queue_t *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
        request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

        if (!q)
                return NULL;

        q->node = node_id;
        blk_init_free_list(q);

        q->request_fn           = rfn;
        q->back_merge_fn        = ll_back_merge_fn;
        q->front_merge_fn       = ll_front_merge_fn;
        q->merge_requests_fn    = ll_merge_requests_fn;
        q->prep_rq_fn           = NULL;
        q->unplug_fn            = generic_unplug_device;
        q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER);
        q->queue_lock           = lock;

        blk_queue_segment_boundary(q, 0xffffffff);

        blk_queue_make_request(q, __make_request);
        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);

        /* all done */
        elevator_init(q, NULL);
}
block/ll_rw_blk.c
32
sys_read()
ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
        struct file *file;
        ssize_t ret = -EBADF;
        int fput_needed;

        file = fget_light(fd, &fput_needed);
        if (file) {
                loff_t pos = file_pos_read(file);
                ret = vfs_read(file, buf, count, &pos);
                file_pos_write(file, pos);
                fput_light(file, fput_needed);
        }

        return ret;
}
33
vfs_read()
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
        ssize_t ret;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
                return -EINVAL;
        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
                return -EFAULT;

        ret = rw_verify_area(READ, file, pos, count);
        if (ret >= 0) {
                count = ret;
                if (file->f_op->read)
                        ret = file->f_op->read(file, buf, count, pos);
                else
                        ret = do_sync_read(file, buf, count, pos);
                if (ret > 0) {
                        fsnotify_access(file->f_dentry);
                        current->rchar += ret;
                }
                current->syscr++;
        }

        return ret;
}
34
do_sync_read()
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct kiocb kiocb;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
        kiocb.ki_left = len;

        for (;;) {
                ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
                if (ret != -EIOCBRETRY)
                        break;
                wait_on_retry_sync_kiocb(&kiocb);
        }

        if (-EIOCBQUEUED == ret)
                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
}
35
generic_file_aio_read() 1/2
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
{
        struct file *filp = iocb->ki_filp;
        ssize_t retval;
        unsigned long seg;
        size_t count;
        loff_t *ppos = &iocb->ki_pos;

        count = 0;
        for (seg = 0; seg < nr_segs; seg++) {
                const struct iovec *iv = &iov[seg];
                ...
        }

        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                ...
        }
36
generic_file_aio_read() 2/2
        retval = 0;
        if (count) {
                for (seg = 0; seg < nr_segs; seg++) {
                        read_descriptor_t desc;

                        desc.written = 0;
                        desc.arg.buf = iov[seg].iov_base;
                        desc.count = iov[seg].iov_len;
                        if (desc.count == 0)
                                continue;
                        desc.error = 0;
                        do_generic_file_read(filp, ppos, &desc, file_read_actor);
                        retval += desc.written;
                        if (desc.error) {
                                retval = retval ?: desc.error;
                                break;
                        }
                }
        }
out:
        return retval;
}
37
do_generic_file_read()
static inline void do_generic_file_read(struct file * filp,
                                        loff_t *ppos,
                                        read_descriptor_t * desc,
                                        read_actor_t actor)
{
        do_generic_mapping_read(filp->f_mapping,
                                &filp->f_ra,
                                filp,
                                ppos,
                                desc,
                                actor);
}