Linux I/O path_20070116


Page 1: Linux I/O path_20070116

Overview of the read path, from the system call down to the device:

sys_read() -> fget_light() -> vfs_read() -> file->f_op->read() / do_sync_read() -> filp->f_op->aio_read() = generic_file_aio_read() -> generic_file_direct_IO() (for O_DIRECT) or do_generic_file_read() -> do_generic_mapping_read(), which works against the Page Cache.

Source files: fs/read_write.c, include/linux/fs.h, fs/ext2/file.c, mm/filemap.c. Below the page cache sit the Generic Block Layer (Request Queue, Elevator / I/O Scheduler) and the Device Driver.
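Everything above is driven by an ordinary read(2) from user space. A minimal, hypothetical test program (the file name is just an example) that exercises this path:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd = open("/etc/hostname", O_RDONLY);   /* any regular file */

        if (fd < 0)
                return 1;
        n = read(fd, buf, sizeof(buf));  /* enters the kernel at sys_read() */
        if (n > 0)
                fwrite(buf, 1, (size_t)n, stdout);
        close(fd);
        return 0;
}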

Page 2: Linux I/O path_20070116

do_generic_mapping_read() (mm/filemap.c), the page cache read loop. For each page of the request it first kicks readahead, then tries to find the page in the page cache:

- page found and up to date ("page ok"): copy it to user space and continue with the next page;
- page found but not up to date: call readpage to bring it up to date, then continue;
- no cached page: allocate a page, add it to the page cache, and call readpage;
- readpage error: leave the loop ("out").

Page 3: Linux I/O path_20070116

From the page cache down to bio submission, using ext2 as the example filesystem.

Readahead path: do_generic_mapping_read() -> page_cache_readahead() -> blockable_page_cache_readahead() -> __do_page_cache_readahead() -> read_pages() -> mapping->a_ops->readpages() = ext2_readpages() -> mpage_readpages(). Single-page path: mapping->a_ops->readpage() = ext2_readpage() -> mpage_readpage(). Both converge on do_mpage_readpage(), which maps file blocks with ext2_get_block() and builds bios; mpage_bio_submit() -> submit_bio() then hands them to the Generic Block Layer (Request Queue, Elevator / I/O Scheduler) and finally the Device Driver.

Source files: mm/filemap.c, mm/readahead.c, fs/mpage.c, fs/ext2/inode.c.
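The ext2 glue behind mapping->a_ops->readpage()/readpages() is tiny; reconstructed from memory of the fs/ext2/inode.c of that era (treat it as a sketch rather than a verbatim quote), ext2 only supplies its block-mapping callback, ext2_get_block(), and lets fs/mpage.c build the bios:

static int ext2_readpage(struct file *file, struct page *page)
{
        return mpage_readpage(page, ext2_get_block);
}

static int
ext2_readpages(struct file *file, struct address_space *mapping,
                struct list_head *pages, unsigned nr_pages)
{
        return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
}

static const struct address_space_operations ext2_aops = {
        .readpage       = ext2_readpage,
        .readpages      = ext2_readpages,
        /* ... writepage, prepare_write, etc. ... */
};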

Page 4: Linux I/O path_20070116

Turning bios into requests. submit_bio() -> generic_make_request(bio) -> q->make_request_fn(q, bio), which for an ordinary disk queue is __make_request(q, bio) in block/ll_rw_blk.c.

__make_request() first tries elv_merge(q, &req, bio). On a merge, q->back_merge_fn() adjusts the existing request (and ll_merge_requests_fn() combines two requests that have become contiguous), and the scheduler is notified through elv_merged_request() -> e->ops->elevator_merged_fn(). If no merge is possible, a new request is allocated via get_request_wait() -> get_request() (elv_may_queue(), current_io_context(), blk_alloc_request()), filled in by init_request_from_bio(), and inserted by add_request() -> __elv_add_request() -> elv_insert() -> e->ops->elevator_add_req_fn(q, rq), i.e. handed to the I/O Scheduler sitting in front of the Request Queue and Device Driver.

The BLK_TA_* names mark the blktrace events generated along this path: BLK_TA_QUEUE for the incoming bio, BLK_TA_MERGE when it is merged into an existing request, BLK_TA_GETRQ / BLK_TA_SLEEPRQ when a request is allocated (without / with having to sleep for one), and BLK_TA_INSERT when the request is inserted into the elevator.

Source files: include/linux/blkdev.h, fs/mpage.c, block/ll_rw_blk.c, block/elevator.c, block/as-iosched.c.
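Condensed, the decision __make_request() makes is "merge or allocate". The outline below is an abridged paraphrase (error handling, plugging and barrier logic dropped), not the verbatim block/ll_rw_blk.c source:

static int __make_request(request_queue_t *q, struct bio *bio)
{
        struct request *req;
        int el_ret;

        spin_lock_irq(q->queue_lock);
        el_ret = elv_merge(q, &req, bio);       /* try to merge into an existing request */
        if (el_ret == ELEVATOR_BACK_MERGE || el_ret == ELEVATOR_FRONT_MERGE) {
                /* bio folded into req; elv_merged_request() notifies the scheduler */
                goto out;
        }
        spin_unlock_irq(q->queue_lock);

        /* no merge possible: allocate a request, sleeping if the queue is full */
        req = get_request_wait(q, bio_data_dir(bio), bio);
        init_request_from_bio(req, bio);

        spin_lock_irq(q->queue_lock);
        add_request(q, req);    /* __elv_add_request() -> elevator_add_req_fn() */
out:
        spin_unlock_irq(q->queue_lock);
        return 0;
}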

Page 5: Linux I/O path_20070116

How the IDE driver hooks itself into the block layer at initialization time.

Registering the I/O request dispatcher: ide_generic_init() -> ideprobe_init() -> hwif_init() -> ide_init_queue() -> blk_init_queue_node(do_ide_request, ...), which sets q->request_fn = rfn; later queue runs therefore end up in do_ide_request() -> ide_do_request().

Registering the IRQ handler: init_irq() -> request_irq(hwif->irq, &ide_intr, ...), so completion interrupts are delivered to ide_intr().

Source files: drivers/ide/ide-generic.c, drivers/ide/ide-probe.c, drivers/ide/ide-io.c, block/ll_rw_blk.c.
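The IDE-specific functions above follow the generic pattern every request-based block driver of that era used. A hypothetical skeleton, assuming a 2.6.19-ish kernel (the IRQ handler prototype changed around that release); the mydev_* names and the IRQ number are invented for illustration, while blk_init_queue(), elv_next_request(), end_request() and request_irq() are real APIs:

#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(mydev_lock);

static void mydev_request_fn(request_queue_t *q)   /* the "I/O request dispatcher" */
{
        struct request *rq;

        while ((rq = elv_next_request(q)) != NULL) {
                /* a real driver programs the hardware here and completes the
                 * request from its interrupt handler; this skeleton just
                 * fails every request immediately */
                end_request(rq, 0);
        }
}

static irqreturn_t mydev_intr(int irq, void *dev_id)   /* the "IRQ handler" */
{
        /* acknowledge the controller and complete the in-flight request */
        return IRQ_HANDLED;
}

static int __init mydev_init(void)
{
        request_queue_t *q = blk_init_queue(mydev_request_fn, &mydev_lock);

        if (!q)
                return -ENOMEM;
        /* 14 is an arbitrary example IRQ number */
        return request_irq(14, mydev_intr, IRQF_SHARED, "mydev", q);
}
module_init(mydev_init);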

Page 6: Linux I/O path_20070116

Disk request queue without an I/O scheduler: conceptually, elv_insert() places requests directly on the queue.

Insertion side: q->make_request_fn() -> add_request() -> __elv_add_request() -> elv_insert() -> list_add_tail() onto the request queue (blktrace: BLK_TA_INSERT, BLK_TA_PLUG).

Dispatch side: q->request_fn() (for IDE, ide_do_request()) -> elv_next_request() -> rq = __elv_next_request(q) -> start_request() issues the request to the Disk (BLK_TA_ISSUE, the 'D' event); completion comes back by interrupt.

Source files: include/linux/blkdev.h, block/ll_rw_blk.c, block/elevator.c, block/as-iosched.c.

Page 7: Linux I/O path_20070116

Disk request queue with an I/O scheduler.

Insertion side: q->make_request_fn() -> add_request() -> __elv_add_request() -> elv_insert() -> e->ops->elevator_add_req_fn(q, rq), i.e. the request first goes into the scheduler's private structures (BLK_TA_INSERT, BLK_TA_PLUG).

Dispatch side: q->request_fn() -> ide_do_request() -> elv_next_request() -> rq = __elv_next_request(q), which takes requests from the dispatch queue or, when it is empty, asks the scheduler for more via e->ops->elevator_dispatch_fn(); start_request() then issues the request to the Disk (BLK_TA_ISSUE) and completion arrives by interrupt.

Source files: include/linux/blkdev.h, block/ll_rw_blk.c, block/elevator.c, block/as-iosched.c.

Page 8: Linux I/O path_20070116

ide_do_request() fetches work with rq = elv_next_request(drive->queue) and issues it with start_request(drive, rq) ("ISSUE"). elv_next_request() in turn loops over __elv_next_request(), which drains the dispatch queue and otherwise asks the scheduler via q->elevator->ops->elevator_dispatch_fn(q, 0):

static inline struct request *__elv_next_request(request_queue_t *q)
{
        struct request *rq;

        while (1) {
                while (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
                        if (blk_do_ordered(q, &rq))
                                return rq;
                }

                if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
                        return NULL;
        }
}

Source files: drivers/ide/ide-io.c, block/ll_rw_blk.c, block/elevator.c, block/as-iosched.c.

Page 9: Linux I/O path_20070116

The noop I/O scheduler (block/noop-iosched.c). Its elevator-private data structure is a single FIFO list: noop_init_queue() allocates it, noop_add_request() appends to it, and noop_dispatch() moves the head request to the dispatch queue in front of the Disk.

static void *noop_init_queue(request_queue_t *q, elevator_t *e)
{
        struct noop_data *nd;

        nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
        if (!nd)
                return NULL;
        INIT_LIST_HEAD(&nd->queue);
        return nd;
}

static void noop_add_request(request_queue_t *q, struct request *rq)
{
        struct noop_data *nd = q->elevator->elevator_data;

        list_add_tail(&rq->queuelist, &nd->queue);
}

static int noop_dispatch(request_queue_t *q, int force)
{
        struct noop_data *nd = q->elevator->elevator_data;

        if (!list_empty(&nd->queue)) {
                struct request *rq;

                rq = list_entry(nd->queue.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
                elv_dispatch_sort(q, rq);
                return 1;
        }
        return 0;
}
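What turns these functions into a selectable scheduler is an elevator_type descriptor registered with the elevator core. Roughly (field set abridged, reconstructed from memory of the block/noop-iosched.c of that era, so details may differ):

static struct elevator_type elevator_noop = {
        .ops = {
                .elevator_dispatch_fn   = noop_dispatch,
                .elevator_add_req_fn    = noop_add_request,
                .elevator_init_fn       = noop_init_queue,
                /* merge/former/latter/exit hooks omitted here */
        },
        .elevator_name  = "noop",
        .elevator_owner = THIS_MODULE,
};

static int __init noop_init(void)
{
        elv_register(&elevator_noop);   /* register with the elevator core */
        return 0;
}
module_init(noop_init);

At run time the scheduler for a queue can be switched via /sys/block/<dev>/queue/scheduler (or chosen at boot with the elevator= parameter); elevator_init(q, NULL) on page 31 picks the compiled-in default.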

Page 10: Linux I/O path_20070116

The runtime data of the deadline I/O scheduler (block/deadline-iosched.c). Every pending request is linked into two structures per data direction: an rb tree (sort_list[READ] / sort_list[WRITE]) ordered by sector, and a FIFO list (fifo_list[READ] / fifo_list[WRITE]) ordered by arrival, i.e. by deadline. next_rq[READ] / next_rq[WRITE] cache the next request in sort order. The original diagram shows read requests 4, 5, 6 and write requests 7, 8, 9 present on both structures at once.

struct deadline_data {
        /* requests (deadline_rq s) are present on both sort_list and fifo_list */
        struct rb_root sort_list[2];
        struct list_head fifo_list[2];

        /* next in sort order. read, write or both are NULL */
        struct request *next_rq[2];
        unsigned int batching;          /* number of sequential requests made */
        sector_t last_sector;           /* head position */
        unsigned int starved;           /* times reads have starved writes */

        /* settings that change how the i/o scheduler behaves */
        int fifo_expire[2];
        int fifo_batch;
        int writes_starved;
        int front_merges;
};
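For scale, the tunables at the bottom of the structure have the following defaults in the block/deadline-iosched.c of that era (quoted from memory and exposed per queue under /sys/block/<dev>/queue/iosched/, so verify against your source tree):

static const int read_expire = HZ / 2;  /* deadline for reads: 500 ms            */
static const int write_expire = 5 * HZ; /* deadline for writes: 5 s              */
static const int writes_starved = 2;    /* read batches allowed to starve writes */
static const int fifo_batch = 16;       /* requests per sequential batch         */
/* front_merges defaults to enabled (1) */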

Page 11: Linux I/O path_20070116

deadline_add_request(): add a request to both the rb tree and the FIFO list (block/deadline-iosched.c; the sort_list / fifo_list / next_rq picture from page 10 applies).

static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        const int data_dir = rq_data_dir(rq);

        deadline_add_rq_rb(dd, rq);

        /*
         * set expire time (only used for reads) and add to fifo list
         */
        rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
        list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}

Page 12: Linux I/O path_20070116

Dispatching, step 1 (from deadline_dispatch_requests() in block/deadline-iosched.c): check whether a sequential batch is running and still entitled to continue.

        if (dd->next_rq[WRITE])
                rq = dd->next_rq[WRITE];
        else
                rq = dd->next_rq[READ];

        if (rq) {
                /* we have a "next request" */

                if (dd->last_sector != rq->sector)
                        /* end the batch on a non sequential request */
                        dd->batching += dd->fifo_batch;

                if (dd->batching < dd->fifo_batch)
                        /* we are still entitled to batch */
                        goto dispatch_request;
        }

Page 13: Linux I/O path_20070116

Dispatching, step 2: if no batch is running, choose a new data direction to serve (block/deadline-iosched.c). Reads are always favored, unless writes have been starved for too long (dd->starved has reached dd->writes_starved).

        if (reads) {
                if (writes && (dd->starved++ >= dd->writes_starved))
                        goto dispatch_writes;

                data_dir = READ;

                goto dispatch_find_request;
        }

        if (writes) {
dispatch_writes:
                dd->starved = 0;

                data_dir = WRITE;

                goto dispatch_find_request;
        }

Page 14: Linux I/O path_20070116

Dispatching, step 3: choose an appropriate request (block/deadline-iosched.c). If the request at the head of the FIFO list has expired, serve it; otherwise behave like a one-way elevator and continue in sort order from next_rq, falling back to the lowest-sector request in the rb tree.

dispatch_find_request:
        if (deadline_check_fifo(dd, data_dir)) {
                dd->batching = 0;
                rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
        } else if (dd->next_rq[data_dir]) {
                rq = dd->next_rq[data_dir];
        } else {
                struct rb_node *node;

                dd->batching = 0;
                node = rb_first(&dd->sort_list[data_dir]);
                if (node)
                        rq = rb_entry_rq(node);
        }
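The expiry test used in step 3 is a one-liner; deadline_check_fifo() looks essentially like this (reproduced from memory from block/deadline-iosched.c):

static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
        struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);

        /* the oldest request in this direction has passed its deadline */
        if (time_after(jiffies, rq_fifo_time(rq)))
                return 1;

        return 0;
}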

Page 15: Linux I/O path_20070116

Dispatching, step 4: dispatch the chosen request. Remove it from the elevator's private structures, put it on the dispatch queue, and update the cached "last" and "next" request information (block/deadline-iosched.c). In the original diagram request 4 was picked in the previous step, so it leaves sort_list[READ] / fifo_list[READ] and heads for the Disk, while next_rq[READ] now points at its rb-tree successor.

static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
        const int data_dir = rq_data_dir(rq);
        struct rb_node *rbnext = rb_next(&rq->rb_node);

        dd->next_rq[READ] = NULL;
        dd->next_rq[WRITE] = NULL;

        if (rbnext)
                dd->next_rq[data_dir] = rb_entry_rq(rbnext);

        dd->last_sector = rq->sector + rq->nr_sectors;

        deadline_move_to_dispatch(dd, rq);
}

Page 16: Linux I/O path_20070116

as_add_request(): how the anticipatory scheduler accepts a new request (block/as-iosched.c).

static void as_add_request(request_queue_t *q, struct request *rq)
{
        struct as_data *ad = q->elevator->elevator_data;
        int data_dir;

        RQ_SET_STATE(rq, AS_RQ_NEW);

        data_dir = rq_is_sync(rq);

        rq->elevator_private = as_get_io_context(q->node);

        if (RQ_IOC(rq)) {
                as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
                atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
        }

        as_add_rq_rb(ad, rq);

        /*
         * set expire time (only used for reads) and add to fifo list
         */
        rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
        list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);

        as_update_rq(ad, rq); /* keep state machine up to date */
        RQ_SET_STATE(rq, AS_RQ_QUEUED);
}

Page 17: Linux I/O path_20070116

The as_add_request() call graph. as_add_request() uses as_get_io_context(q->node) -> get_io_context() -> current_io_context() / alloc_as_io_context() to attach the submitting process's I/O context; as_update_iohist() -> as_update_thinktime() / as_update_seekdist() to refresh that context's statistics; as_add_rq_rb(), rq_set_fifo_time() and list_add_tail() to queue the request; and finally as_update_rq(ad, rq) -> as_choose_req() and as_can_break_anticipation(), which may end anticipation via as_antic_stop() -> del_timer() + kblockd_schedule_work().

The accompanying diagram shows the ownership chain task_struct -> io_context -> as_io_context, with each process's queued requests associated with its as_io_context.

Source files: include/linux/list.h, include/linux/elevator.h, block/ll_rw_blk.c, block/as-iosched.c.

Page 18: Linux I/O path_20070116

(Duplicate of the as_add_request() call graph shown on page 17.)

Page 19: Linux I/O path_20070116

as_update_iohist(): update the per-process think-time and seek-distance statistics (block/as-iosched.c, abridged as on the original slide).

static void
as_update_iohist(struct as_data *ad, struct as_io_context *aic,
                 struct request *rq)
{
        ...
        if (data_dir == REQ_SYNC) {
                unsigned long in_flight = atomic_read(&aic->nr_queued)
                                        + atomic_read(&aic->nr_dispatched);
                spin_lock(&aic->lock);
                if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
                    test_bit(AS_TASK_IOSTARTED, &aic->state)) {
                        /* Calculate read -> read thinktime */
                        if (test_bit(AS_TASK_IORUNNING, &aic->state)
                            && in_flight == 0) {
                                thinktime = jiffies - aic->last_end_request;
                                thinktime = min(thinktime, MAX_THINKTIME-1);
                        }
                        as_update_thinktime(ad, aic, thinktime);

                        /* Calculate read -> read seek distance */
                        if (aic->last_request_pos < rq->sector)
                                seek_dist = rq->sector - aic->last_request_pos;
                        else
                                seek_dist = aic->last_request_pos - rq->sector;
                        as_update_seekdist(ad, aic, seek_dist);
                }
                aic->last_request_pos = rq->sector + rq->nr_sectors;
                set_bit(AS_TASK_IOSTARTED, &aic->state);
                spin_unlock(&aic->lock);
        }
}

Page 20: Linux I/O path_20070116

(Duplicate of the as_add_request() call graph shown on page 17.)

Page 21: Linux I/O path_20070116

The anticipation state machine. Transitions between ANTIC_OFF, ANTIC_WAIT_REQ, ANTIC_WAIT_NEXT and ANTIC_FINISHED are driven by as_add_request(), as_add_rq_rb(), as_update_rq(), as_antic_stop() (which kicks kblockd_schedule_work()), as_dispatch_request(), as_move_to_dispatch(), as_antic_waitreq(), as_antic_waitnext(), as_antic_timeout() and as_completed_request().

enum anticipation_status {
        ANTIC_OFF = 0,          /* Not anticipating (normal operation)  */
        ANTIC_WAIT_REQ,         /* The last read has not yet completed  */
        ANTIC_WAIT_NEXT,        /* Currently anticipating a request vs
                                   last read (which has completed) */
        ANTIC_FINISHED,         /* Anticipating but have found a candidate
                                   or timed out */
};
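To make the ANTIC_WAIT_NEXT transition concrete: arming the anticipation timer amounts to the following condensed sketch, paraphrased rather than quoted from block/as-iosched.c (antic_start marks when anticipation began and antic_expire is the tunable window; the real function also sanity-checks the current state):

static void as_antic_waitnext(struct as_data *ad)
{
        /* wait at most antic_expire jiffies for another request from the
         * process we just served; as_antic_timeout() fires otherwise */
        unsigned long timeout = ad->antic_start + ad->antic_expire;

        mod_timer(&ad->antic_timer, timeout);
        ad->antic_status = ANTIC_WAIT_NEXT;
}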

Page 22: Linux I/O path_20070116

as_antic_stop() (block/as-iosched.c):

/*
 * This is called directly by the functions in this file to stop anticipation.
 * We kill the timer and schedule a call to the request_fn asap.
 */
static void as_antic_stop(struct as_data *ad)
{
        int status = ad->antic_status;

        if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
                if (status == ANTIC_WAIT_NEXT)
                        del_timer(&ad->antic_timer);
                ad->antic_status = ANTIC_FINISHED;
                /* see as_work_handler */
                kblockd_schedule_work(&ad->antic_work);
        }
}

Page 23: Linux I/O path_20070116

as_update_rq() (block/as-iosched.c):

/*
 * as_update_rq must be called whenever a request (rq) is added to
 * the sort_list. This function keeps caches up to date, and checks if the
 * request might be one we are "anticipating"
 */
static void as_update_rq(struct as_data *ad, struct request *rq)
{
        const int data_dir = rq_is_sync(rq);

        /* keep the next_rq cache up to date */
        ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);

        /*
         * have we been anticipating this request?
         * or does it come from the same process as the one we are anticipating
         * for?
         */
        if (ad->antic_status == ANTIC_WAIT_REQ
                        || ad->antic_status == ANTIC_WAIT_NEXT) {
                if (as_can_break_anticipation(ad, rq))
                        as_antic_stop(ad);
        }
}

Page 24: Linux I/O path_20070116

as_work_handler() (block/as-iosched.c):

/*
 * This is executed in a "deferred" process context, by kblockd. It calls the
 * driver's request_fn so the driver can submit that request.
 *
 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
 * state before calling, and don't rely on any state over calls.
 *
 * FIXME! dispatch queue is not a queue at all!
 */
static void as_work_handler(void *data)
{
        struct request_queue *q = data;
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        blk_start_queueing(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
}

Page 25: Linux I/O path_20070116

as_antic_timeout() (block/as-iosched.c):

/*
 * as_antic_timeout is the timer function set by as_antic_waitnext.
 */
static void as_antic_timeout(unsigned long data)
{
        struct request_queue *q = (struct request_queue *)data;
        struct as_data *ad = q->elevator->elevator_data;
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        if (ad->antic_status == ANTIC_WAIT_REQ
                        || ad->antic_status == ANTIC_WAIT_NEXT) {
                struct as_io_context *aic = ad->io_context->aic;

                ad->antic_status = ANTIC_FINISHED;
                kblockd_schedule_work(&ad->antic_work);

                if (aic->ttime_samples == 0) {
                        /* process anticipated on has exited or timed out */
                        ad->exit_prob = (7*ad->exit_prob + 256)/8;
                }
                if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
                        /* process not "saved" by a cooperating request */
                        ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
                }
        }
        spin_unlock_irqrestore(q->queue_lock, flags);
}

Page 26: Linux I/O path_20070116

as_put_io_context() (block/as-iosched.c):

static void as_put_io_context(struct request *rq)
{
        struct as_io_context *aic;

        if (unlikely(!RQ_IOC(rq)))
                return;

        aic = RQ_IOC(rq)->aic;

        if (rq_is_sync(rq) && aic) {
                spin_lock(&aic->lock);
                set_bit(AS_TASK_IORUNNING, &aic->state);
                aic->last_end_request = jiffies;
                spin_unlock(&aic->lock);
        }

        put_io_context(RQ_IOC(rq));
}

Page 27: Linux I/O path_20070116

The rest of the anticipatory scheduler's call graph (the original slide shows these as one diagram), grouped by role:

- I/O-context handling: as_get_io_context() -> get_io_context() / alloc_as_io_context(); as_put_io_context() -> put_io_context(); copy_io_context().
- Dispatch path: as_dispatch_request() with as_batch_expired(), as_fifo_expired(), as_find_next_rq(), as_choose_req(), and as_move_to_dispatch() -> as_remove_queued_request(), elv_dispatch_sort().
- Anticipation logic: as_antic_expired(), as_antic_waitreq(), as_antic_waitnext(), as_can_anticipate(), as_can_break_anticipation(), as_close_req(), as_antic_stop() -> kblockd_schedule_work().
- Completion handling: as_completed_request() -> update_write_batch().
- Statistics: as_update_rq(), as_update_iohist() -> as_update_seekdist() / as_update_thinktime().

Page 28: Linux I/O path_20070116

Where kblockd comes from: during block-layer initialization (blk_dev_init(), alongside genhd_device_init()) the kernel creates the kblockd workqueue with create_workqueue("kblockd"); it serves the deferred queue runs seen above.
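kblockd_schedule_work(), used by as_antic_stop() and friends, is just a thin wrapper around that workqueue. Essentially (reconstructed from block/ll_rw_blk.c of that era, so treat as a sketch):

static struct workqueue_struct *kblockd_workqueue;

int kblockd_schedule_work(struct work_struct *work)
{
        return queue_work(kblockd_workqueue, work);
}

int __init blk_dev_init(void)
{
        kblockd_workqueue = create_workqueue("kblockd");
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");
        /* ... request slab caches, congestion thresholds, etc. ... */
        return 0;
}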

Page 29: Linux I/O path_20070116

Running the queue (block/ll_rw_blk.c). blk_start_queueing() calls q->request_fn() directly when the queue is not plugged; when it is plugged it goes through __generic_unplug_device(), which removes the plug with blk_remove_plug() (clearing QUEUE_FLAG_PLUGGED and deleting the unplug timer) and then calls q->request_fn().

void blk_start_queueing(request_queue_t *q)
{
        if (!blk_queue_plugged(q))
                q->request_fn(q);
        else
                __generic_unplug_device(q);
}

void __generic_unplug_device(request_queue_t *q)
{
        if (unlikely(blk_queue_stopped(q)))
                return;

        if (!blk_remove_plug(q))
                return;

        q->request_fn(q);
}

int blk_remove_plug(request_queue_t *q)
{
        WARN_ON(!irqs_disabled());

        if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
                return 0;

        del_timer(&q->unplug_timer);
        return 1;
}
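The counterpart that sets the plug (and arms the 3 ms unplug timer configured on the next page) is blk_plug_device(); approximately, from memory and with the trace hooks omitted:

void blk_plug_device(request_queue_t *q)
{
        WARN_ON(!irqs_disabled());

        /* don't plug a stopped queue: it must be run as soon as possible */
        if (blk_queue_stopped(q))
                return;

        if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
                mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
}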

Page 30: Linux I/O path_20070116

blk_queue_make_request(): install the make_request function and apply the default per-queue settings, including the unplug threshold and the 3 ms unplug timer (block/ll_rw_blk.c).

void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
        q->nr_requests = BLKDEV_MAX_RQ;
        blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
        q->make_request_fn = mfn;
        q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
        q->backing_dev_info.state = 0;
        q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
        blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
        blk_queue_hardsect_size(q, 512);
        blk_queue_dma_alignment(q, 511);
        blk_queue_congestion_threshold(q);
        q->nr_batching = BLK_BATCH_REQ;

        q->unplug_thresh = 4;                   /* hmm */
        q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
        if (q->unplug_delay == 0)
                q->unplug_delay = 1;

        INIT_WORK(&q->unplug_work, blk_unplug_work, q);

        q->unplug_timer.function = blk_unplug_timeout;
        q->unplug_timer.data = (unsigned long)q;

        blk_queue_activity_fn(q, NULL, NULL);
}

Page 31: Linux I/O path_20070116

blk_init_queue_node(): set up a request-based queue, wiring in the driver's request_fn, the default merge functions, __make_request() as the make_request function, and the default elevator via elevator_init(q, NULL). Declared in include/linux/blkdev.h and defined in block/ll_rw_blk.c; the listing is abridged as on the original slide (error handling and the final return of q are omitted).

request_queue_t *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
        request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

        if (!q)
                return NULL;

        q->node = node_id;
        blk_init_free_list(q);

        q->request_fn           = rfn;
        q->back_merge_fn        = ll_back_merge_fn;
        q->front_merge_fn       = ll_front_merge_fn;
        q->merge_requests_fn    = ll_merge_requests_fn;
        q->prep_rq_fn           = NULL;
        q->unplug_fn            = generic_unplug_device;
        q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER);
        q->queue_lock           = lock;

        blk_queue_segment_boundary(q, 0xffffffff);

        blk_queue_make_request(q, __make_request);
        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);

        /* all done */
        elevator_init(q, NULL);
}

Page 32: Linux I/O path_20070116

sys_read() (fs/read_write.c): resolve the file descriptor with fget_light(), read from the current file position, then store the updated position back.

ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
        struct file *file;
        ssize_t ret = -EBADF;
        int fput_needed;

        file = fget_light(fd, &fput_needed);
        if (file) {
                loff_t pos = file_pos_read(file);
                ret = vfs_read(file, buf, count, &pos);
                file_pos_write(file, pos);
                fput_light(file, fput_needed);
        }

        return ret;
}

Page 33: Linux I/O path_20070116

vfs_read() (fs/read_write.c): check permissions and arguments, then dispatch to file->f_op->read() or, when only the aio interface is provided, to do_sync_read().

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
        ssize_t ret;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
                return -EINVAL;
        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
                return -EFAULT;

        ret = rw_verify_area(READ, file, pos, count);
        if (ret >= 0) {
                count = ret;
                if (file->f_op->read)
                        ret = file->f_op->read(file, buf, count, pos);
                else
                        ret = do_sync_read(file, buf, count, pos);
                if (ret > 0) {
                        fsnotify_access(file->f_dentry);
                        current->rchar += ret;
                }
                current->syscr++;
        }

        return ret;
}

Page 34: Linux I/O path_20070116

do_sync_read() (fs/read_write.c): wrap the user buffer in an iovec and a synchronous kiocb, call the file's aio_read(), and wait if the operation was queued or must be retried.

ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct kiocb kiocb;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);
        kiocb.ki_pos = *ppos;
        kiocb.ki_left = len;

        for (;;) {
                ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
                if (ret != -EIOCBRETRY)
                        break;
                wait_on_retry_sync_kiocb(&kiocb);
        }

        if (-EIOCBQUEUED == ret)
                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
        return ret;
}

Page 35: Linux I/O path_20070116

generic_file_aio_read() 1/2 (mm/filemap.c, abridged as on the slide): validate the iovec segments, then either coalesce them and go direct-to-BIO for O_DIRECT, or fall through to the buffered path on the next page.

ssize_t generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                              unsigned long nr_segs, loff_t pos)
{
        struct file *filp = iocb->ki_filp;
        ssize_t retval;
        unsigned long seg;
        size_t count;
        loff_t *ppos = &iocb->ki_pos;

        count = 0;
        for (seg = 0; seg < nr_segs; seg++) {
                const struct iovec *iv = &iov[seg];
                ...
        }

        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                ...
        }

Page 36: Linux I/O path_20070116

generic_file_aio_read() 2/2: the buffered path loops over the iovec segments and hands each one, described by a read_descriptor_t, to do_generic_file_read() with file_read_actor as the copy-to-user callback.

        retval = 0;
        if (count) {
                for (seg = 0; seg < nr_segs; seg++) {
                        read_descriptor_t desc;

                        desc.written = 0;
                        desc.arg.buf = iov[seg].iov_base;
                        desc.count = iov[seg].iov_len;
                        if (desc.count == 0)
                                continue;
                        desc.error = 0;
                        do_generic_file_read(filp, ppos, &desc, file_read_actor);
                        retval += desc.written;
                        if (desc.error) {
                                retval = retval ?: desc.error;
                                break;
                        }
                }
        }
out:
        return retval;
}

Page 37: Linux I/O path_20070116

do_generic_file_read() (include/linux/fs.h): a thin wrapper that passes the file's address_space and readahead state to do_generic_mapping_read(), the page cache loop of pages 2 and 3.

static inline void do_generic_file_read(struct file * filp,
                                        loff_t *ppos,
                                        read_descriptor_t * desc,
                                        read_actor_t actor)
{
        do_generic_mapping_read(filp->f_mapping,
                                &filp->f_ra,
                                filp,
                                ppos,
                                desc,
                                actor);
}