root/trunk/server/slabs.c @ 724

Revision 724, 12.5 kB (checked in by dormando, 21 months ago)

Enable use of large memory pages (Trond Norbye) <Trond.Norbye@…>

Initial support for solaris.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/*
3 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
4 * and are divided into chunks. The chunk sizes start off at the size of the
5 * "item" structure plus space for a small key and value. They increase by
6 * a multiplier factor from there, up to half the maximum slab size. The last
7 * slab size is always 1MB, since that's the maximum item size allowed by the
8 * memcached protocol.
9 *
10 * $Id$
11 */
12#include "memcached.h"
13#include <sys/stat.h>
14#include <sys/socket.h>
15#include <sys/signal.h>
16#include <sys/resource.h>
17#include <fcntl.h>
18#include <netinet/in.h>
19#include <errno.h>
20#include <stdlib.h>
21#include <stdio.h>
22#include <string.h>
23#include <assert.h>
24
25#define POWER_SMALLEST 1
26#define POWER_LARGEST  200
27#define POWER_BLOCK 1048576
28#define CHUNK_ALIGN_BYTES 8
29#define DONT_PREALLOC_SLABS
30
31/* powers-of-N allocation structures */
32
33typedef struct {
34    unsigned int size;      /* sizes of items */
35    unsigned int perslab;   /* how many items per slab */
36
37    void **slots;           /* list of item ptrs */
38    unsigned int sl_total;  /* size of previous array */
39    unsigned int sl_curr;   /* first free slot */
40
41    void *end_page_ptr;         /* pointer to next free item at end of page, or 0 */
42    unsigned int end_page_free; /* number of items remaining at end of last alloced page */
43
44    unsigned int slabs;     /* how many slabs were allocated for this class */
45
46    void **slab_list;       /* array of slab pointers */
47    unsigned int list_size; /* size of prev array */
48
49    unsigned int killing;  /* index+1 of dying slab, or zero if none */
50} slabclass_t;
51
52static slabclass_t slabclass[POWER_LARGEST + 1];
53static size_t mem_limit = 0;
54static size_t mem_malloced = 0;
55static int power_largest;
56
57static void *mem_base = NULL;
58static void *mem_current = NULL;
59static size_t mem_avail = 0;
60
61/*
62 * Forward Declarations
63 */
64static int do_slabs_newslab(const unsigned int id);
65static void *memory_allocate(size_t size);
66
67#ifndef DONT_PREALLOC_SLABS
68/* Preallocate as many slab pages as possible (called from slabs_init)
69   on start-up, so users don't get confused out-of-memory errors when
70   they do have free (in-slab) space, but no space to make new slabs.
71   if maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
72   slab types can be made.  if max memory is less than 18 MB, only the
73   smaller ones will be made.  */
74static void slabs_preallocate (const unsigned int maxslabs);
75#endif
76
77/*
78 * Figures out which slab class (chunk size) is required to store an item of
79 * a given size.
80 *
81 * Given object size, return id to use when allocating/freeing memory for object
82 * 0 means error: can't store such a large object
83 */
84
85unsigned int slabs_clsid(const size_t size) {
86    int res = POWER_SMALLEST;
87
88    if (size == 0)
89        return 0;
90    while (size > slabclass[res].size)
91        if (res++ == power_largest)     /* won't fit in the biggest slab */
92            return 0;
93    return res;
94}
95
96/**
97 * Determines the chunk sizes and initializes the slab class descriptors
98 * accordingly.
99 */
100void slabs_init(const size_t limit, const double factor, const bool prealloc) {
101    int i = POWER_SMALLEST - 1;
102    unsigned int size = sizeof(item) + settings.chunk_size;
103
104    /* Factor of 2.0 means use the default memcached behavior */
105    if (factor == 2.0 && size < 128)
106        size = 128;
107
108    mem_limit = limit;
109
110    if (prealloc) {
111        /* Allocate everything in a big chunk with malloc */
112        mem_base = malloc(mem_limit);
113        if (mem_base != NULL) {
114            mem_current = mem_base;
115            mem_avail = mem_limit;
116        } else {
117            fprintf(stderr, "Warning: Failed to allocate requested memory in"
118                    " one large chunk.\nWill allocate in smaller chunks\n");
119        }
120    }
121
122    memset(slabclass, 0, sizeof(slabclass));
123
124    while (++i < POWER_LARGEST && size <= POWER_BLOCK / 2) {
125        /* Make sure items are always n-byte aligned */
126        if (size % CHUNK_ALIGN_BYTES)
127            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
128
129        slabclass[i].size = size;
130        slabclass[i].perslab = POWER_BLOCK / slabclass[i].size;
131        size *= factor;
132        if (settings.verbose > 1) {
133            fprintf(stderr, "slab class %3d: chunk size %6u perslab %5u\n",
134                    i, slabclass[i].size, slabclass[i].perslab);
135        }
136    }
137
138    power_largest = i;
139    slabclass[power_largest].size = POWER_BLOCK;
140    slabclass[power_largest].perslab = 1;
141
142    /* for the test suite:  faking of how much we've already malloc'd */
143    {
144        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
145        if (t_initial_malloc) {
146            mem_malloced = (size_t)atol(t_initial_malloc);
147        }
148
149    }
150
151#ifndef DONT_PREALLOC_SLABS
152    {
153        char *pre_alloc = getenv("T_MEMD_SLABS_ALLOC");
154
155        if (pre_alloc == NULL || atoi(pre_alloc) != 0) {
156            slabs_preallocate(power_largest);
157        }
158    }
159#endif
160}
161
162#ifndef DONT_PREALLOC_SLABS
163static void slabs_preallocate (const unsigned int maxslabs) {
164    int i;
165    unsigned int prealloc = 0;
166
167    /* pre-allocate a 1MB slab in every size class so people don't get
168       confused by non-intuitive "SERVER_ERROR out of memory"
169       messages.  this is the most common question on the mailing
170       list.  if you really don't want this, you can rebuild without
171       these three lines.  */
172
173    for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
174        if (++prealloc > maxslabs)
175            return;
176        do_slabs_newslab(i);
177    }
178
179}
180#endif
181
182static int grow_slab_list (const unsigned int id) {
183    slabclass_t *p = &slabclass[id];
184    if (p->slabs == p->list_size) {
185        size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
186        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
187        if (new_list == 0) return 0;
188        p->list_size = new_size;
189        p->slab_list = new_list;
190    }
191    return 1;
192}
193
194static int do_slabs_newslab(const unsigned int id) {
195    slabclass_t *p = &slabclass[id];
196#ifdef ALLOW_SLABS_REASSIGN
197    int len = POWER_BLOCK;
198#else
199    int len = p->size * p->perslab;
200#endif
201    char *ptr;
202
203    if (mem_limit && mem_malloced + len > mem_limit && p->slabs > 0)
204        return 0;
205
206    if (grow_slab_list(id) == 0) return 0;
207
208    ptr = memory_allocate((size_t)len);
209    if (ptr == 0) return 0;
210
211    memset(ptr, 0, (size_t)len);
212    p->end_page_ptr = ptr;
213    p->end_page_free = p->perslab;
214
215    p->slab_list[p->slabs++] = ptr;
216    mem_malloced += len;
217    return 1;
218}
219
220/*@null@*/
221void *do_slabs_alloc(const size_t size) {
222    slabclass_t *p;
223
224    unsigned int id = slabs_clsid(size);
225    if (id < POWER_SMALLEST || id > power_largest)
226        return NULL;
227
228    p = &slabclass[id];
229    assert(p->sl_curr == 0 || ((item *)p->slots[p->sl_curr - 1])->slabs_clsid == 0);
230
231#ifdef USE_SYSTEM_MALLOC
232    if (mem_limit && mem_malloced + size > mem_limit)
233        return 0;
234    mem_malloced += size;
235    return malloc(size);
236#endif
237
238    /* fail unless we have space at the end of a recently allocated page,
239       we have something on our freelist, or we could allocate a new page */
240    if (! (p->end_page_ptr != 0 || p->sl_curr != 0 || do_slabs_newslab(id) != 0))
241        return 0;
242
243    /* return off our freelist, if we have one */
244    if (p->sl_curr != 0)
245        return p->slots[--p->sl_curr];
246
247    /* if we recently allocated a whole page, return from that */
248    if (p->end_page_ptr) {
249        void *ptr = p->end_page_ptr;
250        if (--p->end_page_free != 0) {
251            p->end_page_ptr += p->size;
252        } else {
253            p->end_page_ptr = 0;
254        }
255        return ptr;
256    }
257
258    return NULL;  /* shouldn't ever get here */
259}
260
261void do_slabs_free(void *ptr, const size_t size) {
262    unsigned char id = slabs_clsid(size);
263    slabclass_t *p;
264
265    assert(((item *)ptr)->slabs_clsid == 0);
266    assert(id >= POWER_SMALLEST && id <= power_largest);
267    if (id < POWER_SMALLEST || id > power_largest)
268        return;
269
270    p = &slabclass[id];
271
272#ifdef USE_SYSTEM_MALLOC
273    mem_malloced -= size;
274    free(ptr);
275    return;
276#endif
277
278    if (p->sl_curr == p->sl_total) { /* need more space on the free list */
279        int new_size = (p->sl_total != 0) ? p->sl_total * 2 : 16;  /* 16 is arbitrary */
280        void **new_slots = realloc(p->slots, new_size * sizeof(void *));
281        if (new_slots == 0)
282            return;
283        p->slots = new_slots;
284        p->sl_total = new_size;
285    }
286    p->slots[p->sl_curr++] = ptr;
287    return;
288}
289
290/*@null@*/
291char* do_slabs_stats(int *buflen) {
292    int i, total;
293    char *buf = (char *)malloc(power_largest * 200 + 100);
294    char *bufcurr = buf;
295
296    *buflen = 0;
297    if (buf == NULL) return NULL;
298
299    total = 0;
300    for(i = POWER_SMALLEST; i <= power_largest; i++) {
301        slabclass_t *p = &slabclass[i];
302        if (p->slabs != 0) {
303            unsigned int perslab, slabs;
304
305            slabs = p->slabs;
306            perslab = p->perslab;
307
308            bufcurr += sprintf(bufcurr, "STAT %d:chunk_size %u\r\n", i, p->size);
309            bufcurr += sprintf(bufcurr, "STAT %d:chunks_per_page %u\r\n", i, perslab);
310            bufcurr += sprintf(bufcurr, "STAT %d:total_pages %u\r\n", i, slabs);
311            bufcurr += sprintf(bufcurr, "STAT %d:total_chunks %u\r\n", i, slabs*perslab);
312            bufcurr += sprintf(bufcurr, "STAT %d:used_chunks %u\r\n", i, slabs*perslab - p->sl_curr);
313            bufcurr += sprintf(bufcurr, "STAT %d:free_chunks %u\r\n", i, p->sl_curr);
314            bufcurr += sprintf(bufcurr, "STAT %d:free_chunks_end %u\r\n", i, p->end_page_free);
315            total++;
316        }
317    }
318    bufcurr += sprintf(bufcurr, "STAT active_slabs %d\r\nSTAT total_malloced %llu\r\n", total, (unsigned long long)mem_malloced);
319    bufcurr += sprintf(bufcurr, "END\r\n");
320    *buflen = bufcurr - buf;
321    return buf;
322}
323
324#ifdef ALLOW_SLABS_REASSIGN
325/* Blows away all the items in a slab class and moves its slabs to another
326   class. This is only used by the "slabs reassign" command, for manual tweaking
327   of memory allocation. It's disabled by default since it requires that all
328   slabs be the same size (which can waste space for chunk size mantissas of
329   other than 2.0).
330   1 = success
331   0 = fail
332   -1 = tried. busy. send again shortly. */
333int do_slabs_reassign(unsigned char srcid, unsigned char dstid) {
334    void *slab, *slab_end;
335    slabclass_t *p, *dp;
336    void *iter;
337    bool was_busy = false;
338
339    if (srcid < POWER_SMALLEST || srcid > power_largest ||
340        dstid < POWER_SMALLEST || dstid > power_largest)
341        return 0;
342
343    p = &slabclass[srcid];
344    dp = &slabclass[dstid];
345
346    /* fail if src still populating, or no slab to give up in src */
347    if (p->end_page_ptr || ! p->slabs)
348        return 0;
349
350    /* fail if dst is still growing or we can't make room to hold its new one */
351    if (dp->end_page_ptr || ! grow_slab_list(dstid))
352        return 0;
353
354    if (p->killing == 0) p->killing = 1;
355
356    slab = p->slab_list[p->killing - 1];
357    slab_end = (char*)slab + POWER_BLOCK;
358
359    for (iter = slab; iter < slab_end; (char*)iter += p->size) {
360        item *it = (item *)iter;
361        if (it->slabs_clsid) {
362            if (it->refcount) was_busy = true;
363            item_unlink(it);
364        }
365    }
366
367    /* go through free list and discard items that are no longer part of this slab */
368    {
369        int fi;
370        for (fi = p->sl_curr - 1; fi >= 0; fi--) {
371            if (p->slots[fi] >= slab && p->slots[fi] < slab_end) {
372                p->sl_curr--;
373                if (p->sl_curr > fi) p->slots[fi] = p->slots[p->sl_curr];
374            }
375        }
376    }
377
378    if (was_busy) return -1;
379
380    /* if good, now move it to the dst slab class */
381    p->slab_list[p->killing - 1] = p->slab_list[p->slabs - 1];
382    p->slabs--;
383    p->killing = 0;
384    dp->slab_list[dp->slabs++] = slab;
385    dp->end_page_ptr = slab;
386    dp->end_page_free = dp->perslab;
387    /* this isn't too critical, but other parts of the code do asserts to
388       make sure this field is always 0.  */
389    for (iter = slab; iter < slab_end; (char*)iter += dp->size) {
390        ((item *)iter)->slabs_clsid = 0;
391    }
392    return 1;
393}
394#endif
395
396static void *memory_allocate(size_t size) {
397    void *ret;
398
399    if (mem_base == NULL) {
400        /* We are not using a preallocated large memory chunk */
401        ret = malloc(size);
402    } else {
403        ret = mem_current;
404
405        if (size > mem_avail) {
406            return NULL;
407        }
408
409        /* mem_current pointer _must_ be aligned!!! */
410        if (size % CHUNK_ALIGN_BYTES) {
411            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
412        }
413
414        mem_current += size;
415        if (size < mem_avail) {
416            mem_avail -= size;
417        } else {
418            mem_avail = 0;
419        }
420    }
421
422    return ret;
423}
Note: See TracBrowser for help on using the browser.