]> git.openfabrics.org - ~shefty/rdma-dev.git/commitdiff
writeback: dirty ratelimit - think time compensation
authorWu Fengguang <fengguang.wu@intel.com>
Sun, 12 Jun 2011 01:25:42 +0000 (19:25 -0600)
committerWu Fengguang <fengguang.wu@intel.com>
Sun, 18 Dec 2011 06:20:27 +0000 (14:20 +0800)
Compensate the task's think time when computing the final pause time,
so that ->dirty_ratelimit can be executed accurately.

        think time := time spend outside of balance_dirty_pages()

In the rare case that the task slept longer than the 200ms period time
(result in negative pause time), the sleep time will be compensated in
the following periods, too, if it's less than 1 second.

Accumulated errors are carefully avoided as long as the max pause area
is not hitted.

Pseudo code:

        period = pages_dirtied / task_ratelimit;
        think = jiffies - dirty_paused_when;
        pause = period - think;

1) normal case: period > think

        pause = period - think
        dirty_paused_when = jiffies + pause
        nr_dirtied = 0

                             period time
              |===============================>|
                  think time      pause time
              |===============>|==============>|
        ------|----------------|---------------|------------------------
        dirty_paused_when   jiffies

2) no pause case: period <= think

        don't pause; reduce future pause time by:
        dirty_paused_when += period
        nr_dirtied = 0

                           period time
              |===============================>|
                                  think time
              |===================================================>|
        ------|--------------------------------+-------------------|----
        dirty_paused_when                                       jiffies

Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
include/linux/sched.h
include/trace/events/writeback.h
kernel/fork.c
mm/page-writeback.c

index 1c4f3e9b9bc50e52ea57623f5b886691fb52db14..984c3b2959784ed14b21fbe8811879ddcd62f63a 100644 (file)
@@ -1527,6 +1527,7 @@ struct task_struct {
         */
        int nr_dirtied;
        int nr_dirtied_pause;
+       unsigned long dirty_paused_when; /* start of a write-and-pause period */
 
 #ifdef CONFIG_LATENCYTOP
        int latency_record_count;
index 99d1d0decf88e41a7c0c038d463330e8351046b1..8588a891802339a2939dfc91e5630e3a826781d4 100644 (file)
@@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages,
                 unsigned long dirty_ratelimit,
                 unsigned long task_ratelimit,
                 unsigned long dirtied,
+                unsigned long period,
                 long pause,
                 unsigned long start_time),
 
        TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                dirty_ratelimit, task_ratelimit,
-               dirtied, pause, start_time),
+               dirtied, period, pause, start_time),
 
        TP_STRUCT__entry(
                __array(         char,  bdi, 32)
@@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages,
                __field(unsigned int,   dirtied_pause)
                __field(unsigned long,  paused)
                __field(         long,  pause)
+               __field(unsigned long,  period)
+               __field(         long,  think)
        ),
 
        TP_fast_assign(
@@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages,
                __entry->task_ratelimit = KBps(task_ratelimit);
                __entry->dirtied        = dirtied;
                __entry->dirtied_pause  = current->nr_dirtied_pause;
+               __entry->think          = current->dirty_paused_when == 0 ? 0 :
+                        (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
+               __entry->period         = period * 1000 / HZ;
                __entry->pause          = pause * 1000 / HZ;
                __entry->paused         = (jiffies - start_time) * 1000 / HZ;
        ),
@@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages,
                  "bdi_setpoint=%lu bdi_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
-                 "paused=%lu pause=%ld",
+                 "paused=%lu pause=%ld period=%lu think=%ld",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
@@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages,
                  __entry->dirtied,
                  __entry->dirtied_pause,
                  __entry->paused,      /* ms */
-                 __entry->pause        /* ms */
+                 __entry->pause,       /* ms */
+                 __entry->period,      /* ms */
+                 __entry->think        /* ms */
          )
 );
 
index da4a6a10d088d2d575c4eeae9438f832efc6f41b..f8668cf6a32d616096c3fcedfd0dd5d6d13bd17e 100644 (file)
@@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+       p->dirty_paused_when = 0;
 
        /*
         * Ok, make it visible to the rest of the system.
index 96b3e7aa705c3ebe12f02b52083e80683973a966..491932155825328f3ded187d5d9a28cabdb2edc6 100644 (file)
@@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping,
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        unsigned long bdi_thresh;
+       long period;
        long pause = 0;
        long uninitialized_var(max_pause);
        bool dirty_exceeded = false;
@@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping,
        unsigned long start_time = jiffies;
 
        for (;;) {
+               unsigned long now = jiffies;
+
                /*
                 * Unstable writes are a feature of certain networked
                 * filesystems (i.e. NFS) in which data may have been
@@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping,
                 */
                freerun = dirty_freerun_ceiling(dirty_thresh,
                                                background_thresh);
-               if (nr_dirty <= freerun)
+               if (nr_dirty <= freerun) {
+                       current->dirty_paused_when = now;
+                       current->nr_dirtied = 0;
                        break;
+               }
 
                if (unlikely(!writeback_in_progress(bdi)))
                        bdi_start_background_writeback(bdi);
@@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping,
                task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
                if (unlikely(task_ratelimit == 0)) {
+                       period = max_pause;
                        pause = max_pause;
                        goto pause;
                }
-               pause = HZ * pages_dirtied / task_ratelimit;
+               period = HZ * pages_dirtied / task_ratelimit;
+               pause = period;
+               if (current->dirty_paused_when)
+                       pause -= now - current->dirty_paused_when;
+               /*
+                * For less than 1s think time (ext3/4 may block the dirtier
+                * for up to 800ms from time to time on 1-HDD; so does xfs,
+                * however at much less frequency), try to compensate it in
+                * future periods by updating the virtual time; otherwise just
+                * do a reset, as it may be a light dirtier.
+                */
                if (unlikely(pause <= 0)) {
                        trace_balance_dirty_pages(bdi,
                                                  dirty_thresh,
@@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping,
                                                  dirty_ratelimit,
                                                  task_ratelimit,
                                                  pages_dirtied,
+                                                 period,
                                                  pause,
                                                  start_time);
+                       if (pause < -HZ) {
+                               current->dirty_paused_when = now;
+                               current->nr_dirtied = 0;
+                       } else if (period) {
+                               current->dirty_paused_when += period;
+                               current->nr_dirtied = 0;
+                       }
                        pause = 1; /* avoid resetting nr_dirtied_pause below */
                        break;
                }
@@ -1135,11 +1160,15 @@ pause:
                                          dirty_ratelimit,
                                          task_ratelimit,
                                          pages_dirtied,
+                                         period,
                                          pause,
                                          start_time);
                __set_current_state(TASK_KILLABLE);
                io_schedule_timeout(pause);
 
+               current->dirty_paused_when = now + pause;
+               current->nr_dirtied = 0;
+
                /*
                 * This is typically equal to (nr_dirty < dirty_thresh) and can
                 * also keep "1000+ dd on a slow USB stick" under control.
@@ -1167,11 +1196,10 @@ pause:
        if (!dirty_exceeded && bdi->dirty_exceeded)
                bdi->dirty_exceeded = 0;
 
-       current->nr_dirtied = 0;
        if (pause == 0) { /* in freerun area */
                current->nr_dirtied_pause =
                                dirty_poll_interval(nr_dirty, dirty_thresh);
-       } else if (pause <= max_pause / 4 &&
+       } else if (period <= max_pause / 4 &&
                   pages_dirtied >= current->nr_dirtied_pause) {
                current->nr_dirtied_pause = clamp_val(
                                        dirty_ratelimit * (max_pause / 2) / HZ,