1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
|
/*
* fs/logfs/logfs_abi.h
*
* As should be obvious for Linux kernel code, license is GPLv2
*
* Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
*
* Public header for logfs.
*/
#ifndef FS_LOGFS_LOGFS_ABI_H
#define FS_LOGFS_LOGFS_ABI_H
/*
 * Out-of-kernel compiles may not provide BUILD_BUG_ON.  Give it an empty
 * expansion in that case so the size checks below still compile, as no-ops.
 */
#ifndef BUILD_BUG_ON
#define BUILD_BUG_ON(cond)
#endif

/*
 * SIZE_CHECK(type, size) - define check_<type>(), a static inline function
 * whose body hands "sizeof(struct type) != size" to BUILD_BUG_ON.
 */
#define SIZE_CHECK(tag, bytes)					\
static inline void check_##tag(void)				\
{								\
	BUILD_BUG_ON(sizeof(struct tag) != (bytes));		\
}
/*
* Throughout the logfs code, we're constantly dealing with blocks at
* various positions or offsets. To remove confusion, we strictly
* distinguish between a "position" - the logical position within a
* file and an "offset" - the physical location within the device.
*
* Any usage of the term offset for a logical location or position for
* a physical one is a bug and should get fixed.
*/
/*
* Blocks are allocated in one of several segments depending on their
* level. The following levels are used:
* 0 - regular data block
* 1 - i1 indirect blocks
* 2 - i2 indirect blocks
* 3 - i3 indirect blocks
* 4 - i4 indirect blocks
* 5 - i5 indirect blocks
* 6 - ifile data blocks
* 7 - ifile i1 indirect blocks
* 8 - ifile i2 indirect blocks
* 9 - ifile i3 indirect blocks
* 10 - ifile i4 indirect blocks
* 11 - ifile i5 indirect blocks
* Potential levels to be used in the future:
* 12 - gc recycled blocks, long-lived data
* 13 - replacement blocks, short-lived data
*
* Levels 1-11 are necessary for robust gc operations and help separate
* short-lived metadata from longer-lived file data. In the future,
* file data should get separated into several segments based on simple
* heuristics. Old data recycled during gc operation is expected to be
* long-lived. New data is of uncertain life expectancy. New data
* used to replace older blocks in existing files is expected to be
* short-lived.
*/
/*
 * Magic numbers: the 64bit value is for the superblock, the 32bit value
 * for statfs f_type.
 */
#define LOGFS_MAGIC		0x7a3a8e5cb9d5bf67ULL
#define LOGFS_MAGIC_U32		0xc97e8168U
/*
 * Various blocksize related macros.  Blocksize is currently fixed at 4KiB.
 * Sooner or later that should become configurable and the macros replaced
 * by something superblock-dependent.  Pointers in indirect blocks are and
 * will remain 64bit.
 *
 * LOGFS_BLOCKSIZE	- self-explaining
 * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
 * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
 *
 * Pointer sizes are taken from __be64, the type used for the 64bit
 * on-medium fields, so that these macros need no type beyond the ones the
 * on-medium structures already require (also in out-of-kernel compiles).
 */
#define LOGFS_BLOCKSIZE		(4096ull)
#define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(__be64))
#define LOGFS_BLOCK_BITS	(9)

/*
 * Number of blocks at various levels of indirection.  There are 16 direct
 * block pointers plus a single indirect pointer.
 *
 * INDIRECT_INDEX is the slot following the direct pointers, and
 * LOGFS_EMBEDDED_FIELDS counts the direct pointers plus that one slot.
 */
#define I0_BLOCKS		(16)
#define I1_BLOCKS		LOGFS_BLOCK_FACTOR
#define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
#define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
#define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
#define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)

#define INDIRECT_INDEX		I0_BLOCKS
#define LOGFS_EMBEDDED_FIELDS	(I0_BLOCKS + 1)

/*
 * Sizes at which files require another level of indirection.  Files smaller
 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
 * similar to ext2 fast symlinks.
 *
 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
 * direct pointers, else through the 1x indirect pointer and so forth.
 */
#define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(__be64))
#define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)
/*
 * Each indirect block pointer must have this flag set, if all block pointers
 * behind it are set, i.e. there is no hole hidden in the shadow of this
 * indirect block pointer.
 *
 * pure_ofs() clears the flag from a pointer value.  The argument is
 * parenthesized so that an expression argument built from lower-precedence
 * operators (e.g. "a | b") is masked as a whole rather than only its last
 * operand.
 */
#define LOGFS_FULLY_POPULATED	(1ULL << 63)
#define pure_ofs(ofs)		((ofs) & ~LOGFS_FULLY_POPULATED)
/*
 * LogFS separates data into levels.  A level is the maximal possible
 * distance from the master inode (the inode of the inode file):
 *   - data blocks live on level 0, 1x indirect blocks on level 1, etc.
 *   - inodes live on level 6, indirect blocks of the inode file on
 *     levels 7-11.
 * The separation is needed to guarantee that garbage collection always
 * makes progress.
 *
 * LOGFS_MAX_INDIRECT	- maximal indirection through indirect blocks
 * LOGFS_MAX_LEVELS	- one more, for the actual data level of a file; the
 *			  maximal number of levels for one file
 * LOGFS_NO_AREAS	- twice that, as the inode file and regular files are
 *			  effectively stacked on top of each other
 */
#define LOGFS_MAX_INDIRECT	(5)
#define LOGFS_MAX_LEVELS	(1 + LOGFS_MAX_INDIRECT)
#define LOGFS_NO_AREAS		(LOGFS_MAX_LEVELS * 2)
/* Upper bound on the size of a file name */
#define LOGFS_MAX_NAMELEN	(255)

/* How many segments the primary journal consists of */
#define LOGFS_JOURNAL_SEGS	(16)

/* Upper bound on free/erased/etc. segments listed in journal entries */
#define MAX_CACHED_SEGS		(64)
/*
 * Object store size constants:
 *
 * LOGFS_OBJECT_HEADERSIZE	- size of a single header in the object store
 * LOGFS_SEGMENT_HEADERSIZE	- size of struct logfs_segment_header
 * LOGFS_MAX_OBJECTSIZE		- size of the largest possible object,
 *				  including its header
 * LOGFS_SEGMENT_RESERVE	- amount of space reserved in each segment
 *				  for its segment header and the padded space
 *				  at the end when no further objects fit
 */
#define LOGFS_OBJECT_HEADERSIZE		(0x1c)
#define LOGFS_SEGMENT_HEADERSIZE	(0x18)
#define LOGFS_MAX_OBJECTSIZE		(LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
#define LOGFS_SEGMENT_RESERVE		\
	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
/*
 * Segment types, as stored in the type field of struct logfs_segment_header.
 *
 * NOTE(review): the per-value meanings below are inferred from the names
 * and should be confirmed against the users of these constants.
 */
enum {
	SEG_SUPER	= 0x01,	/* presumably: segment holding a superblock */
	SEG_JOURNAL	= 0x02,	/* presumably: journal segment */
	SEG_OSTORE	= 0x03,	/* presumably: object store segment */
};
/**
 * struct logfs_segment_header - per-segment header in the ostore
 *
 * @crc:	crc32 of header (there is no data)
 * @pad:	unused, must be 0
 * @type:	segment type, one of the SEG_* values
 * @level:	GC level for all objects in this segment
 * @segno:	segment number
 * @ec:		erase count for this segment
 * @gec:	global erase count at time of writing
 *
 * The structure is checked to be LOGFS_SEGMENT_HEADERSIZE bytes large.
 */
struct logfs_segment_header {
	__be32	crc;
	__be16	pad;
	__u8	type;
	__u8	level;
	__be32	segno;
	__be32	ec;
	__be64	gec;
};

SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
/*
 * Feature masks for the incompat, ro_compat and compat feature sets.
 * All three are currently empty.
 */
#define LOGFS_FEATURES_INCOMPAT		(0ULL)
#define LOGFS_FEATURES_RO_COMPAT	(0ULL)
#define LOGFS_FEATURES_COMPAT		(0ULL)
/**
 * struct logfs_disk_super - on-medium superblock
 *
 * @ds_sh:			segment header preceding the superblock fields
 * @ds_magic:			magic number, must equal LOGFS_MAGIC
 * @ds_crc:			crc32 of structure starting with the next field
 * @ds_ifile_levels:		maximum number of levels for ifile
 * @ds_iblock_levels:		maximum number of levels for regular files
 * @ds_data_levels:		number of separate levels for data
 * @ds_segment_shift:		log2 of segment size
 * @ds_block_shift:		log2 of block size
 * @ds_write_shift:		log2 of write size
 * @pad0:			reserved, must be 0
 * @ds_filesystem_size:		presumably the size of the filesystem -
 *				TODO confirm meaning and unit
 * @ds_segment_size:		presumably the segment size -
 *				TODO confirm meaning and unit
 * @ds_bad_seg_reserve:		number of segments reserved to handle bad
 *				blocks
 * @ds_feature_incompat:	incompatible filesystem features
 * @ds_feature_ro_compat:	read-only compatible filesystem features
 * @ds_feature_compat:		compatible filesystem features
 * @ds_feature_flags:		flags
 * @ds_root_reserve:		bytes reserved for the superuser
 * @ds_speed_reserve:		bytes reserved to speed up GC
 * @ds_journal_seg:		segments used by primary journal
 * @ds_super_ofs:		NOTE(review): undocumented; presumably offsets
 *				of the superblocks - confirm
 * @pad3:			reserved, must be 0
 *
 * Contains only read-only fields.  Read-write fields like the amount of used
 * space is tracked in the dynamic superblock, which is stored in the journal.
 *
 * The structure is checked to be 256 bytes large.
 */
struct logfs_disk_super {
	struct logfs_segment_header ds_sh;
	__be64	ds_magic;

	__be32	ds_crc;
	__u8	ds_ifile_levels;
	__u8	ds_iblock_levels;
	__u8	ds_data_levels;
	__u8	ds_segment_shift;
	__u8	ds_block_shift;
	__u8	ds_write_shift;
	__u8	pad0[6];

	__be64	ds_filesystem_size;
	__be32	ds_segment_size;
	__be32	ds_bad_seg_reserve;

	__be64	ds_feature_incompat;
	__be64	ds_feature_ro_compat;

	__be64	ds_feature_compat;
	__be64	ds_feature_flags;

	__be64	ds_root_reserve;
	__be64	ds_speed_reserve;

	__be32	ds_journal_seg[LOGFS_JOURNAL_SEGS];

	__be64	ds_super_ofs[2];
	__be64	pad3[8];
};

SIZE_CHECK(logfs_disk_super, 256);
/* Object types, as stored in the type field of struct logfs_object_header */
enum {
	OBJ_BLOCK	= 0x04,	/* data or indirect block */
	OBJ_INODE	= 0x05,	/* inode */
	OBJ_DENTRY	= 0x06,	/* dentry */
};
/**
 * struct logfs_object_header - per-object header in the ostore
 *
 * @crc:	crc32 of header, excluding data_crc
 * @len:	length of data
 * @type:	object type, one of the OBJ_* values
 * @compr:	compression type
 * @ino:	inode number
 * @bix:	block index
 * @data_crc:	crc32 of payload
 *
 * Declared packed; the structure is checked to be LOGFS_OBJECT_HEADERSIZE
 * bytes large.
 */
struct logfs_object_header {
	__be32	crc;
	__be16	len;
	__u8	type;
	__u8	compr;
	__be64	ino;
	__be64	bix;
	__be32	data_crc;
} __attribute__((packed));

SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
/* Reserved inode numbers */
enum {
	LOGFS_INO_MAPPING	= 0x00,	/* NOTE(review): purpose undocumented */
	LOGFS_INO_MASTER	= 0x01,	/* master inode (for inode file) */
	LOGFS_INO_ROOT		= 0x02,	/* root directory */
	LOGFS_INO_SEGFILE	= 0x03,	/* per-segment used bytes and erase count */
	LOGFS_RESERVED_INOS	= 0x10,	/* presumably the number of reserved
					 * inode numbers - TODO confirm */
};
/*
 * Inode flags.
 *
 * High bits are reserved for in-memory usage and should never be written
 * to the medium.  Low bits should either remain in sync with the
 * corresponding FS_*_FL or reuse slots that obviously don't make sense for
 * logfs.
 */
#define LOGFS_IF_COMPRESSED	0x00000004	/* == FS_COMPR_FL */
#define LOGFS_IF_DIRTY		0x20000000	/* inode must be written back */
#define LOGFS_IF_ZOMBIE		0x40000000	/* inode has been deleted */
#define LOGFS_IF_STILLBORN	0x80000000	/* -ENOSPC happened when creating inode */

/* Flags that chattr can see, and those it may change */
#define LOGFS_FL_USER_VISIBLE		(LOGFS_IF_COMPRESSED)
#define LOGFS_FL_USER_MODIFIABLE	(LOGFS_IF_COMPRESSED)

/* Flags a new file/directory inherits from its parent directory */
#define LOGFS_FL_INHERITED		(LOGFS_IF_COMPRESSED)
/**
 * struct logfs_disk_inode - on-medium inode
 *
 * @di_mode:		file mode
 * @di_height:		NOTE(review): undocumented; presumably the height of
 *			the inode's block tree - confirm
 * @di_pad:		reserved, must be 0
 * @di_flags:		inode flags, see the LOGFS_IF_* definitions
 * @di_uid:		user id
 * @di_gid:		group id
 * @di_ctime:		change time
 * @di_mtime:		modify time
 * @di_atime:		presumably the access time - TODO confirm
 * @di_refcount:	reference count (aka nlink or link count)
 * @di_generation:	inode generation, for nfs
 * @di_used_bytes:	number of bytes used
 * @di_size:		file size
 * @di_data:		data pointers, LOGFS_EMBEDDED_FIELDS of them
 *
 * The structure is checked to be 200 bytes large.
 */
struct logfs_disk_inode {
	__be16	di_mode;
	__u8	di_height;
	__u8	di_pad;
	__be32	di_flags;
	__be32	di_uid;
	__be32	di_gid;

	__be64	di_ctime;
	__be64	di_mtime;
	__be64	di_atime;

	__be32	di_refcount;
	__be32	di_generation;

	__be64	di_used_bytes;
	__be64	di_size;
	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
};

SIZE_CHECK(logfs_disk_inode, 200);
/*
 * Positions of selected struct logfs_disk_inode fields, expressed in units
 * of sizeof(__be64) rather than bytes.  INODE_HEIGHT_OFS is a literal 0.
 */
#define INODE_POINTER_OFS						\
	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
#define INODE_USED_OFS							\
	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
#define INODE_SIZE_OFS							\
	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
#define INODE_HEIGHT_OFS	(0)
/**
 * struct logfs_disk_dentry - on-medium dentry structure
 *
 * @ino:	inode number
 * @namelen:	length of file name
 * @type:	file type, identical to bits 12..15 of mode
 * @name:	file name, up to LOGFS_MAX_NAMELEN bytes
 *
 * Declared packed; the structure is checked to be 266 bytes large.
 */
/* FIXME: add 6 bytes of padding to remove the __packed */
struct logfs_disk_dentry {
	__be64	ino;
	__be16	namelen;
	__u8	type;
	__u8	name[LOGFS_MAX_NAMELEN];
} __attribute__((packed));

SIZE_CHECK(logfs_disk_dentry, 266);
/*
 * Sentinel values for struct logfs_segment_entry: RESERVED is stored in
 * the valid field, BADSEG in the ec_level field.  Both are all-ones.
 */
#define RESERVED	0xffffffff
#define BADSEG		0xffffffff
/**
 * struct logfs_segment_entry - segment file entry
 *
 * @ec_level:	erase count and level
 * @valid:	number of valid bytes
 *
 * The segment file holds one entry per segment.
 *
 * ec_level packs the erasecount into the upper 28 bits and the level into
 * the lower 4 bits.  An ec_level of BADSEG (-1) identifies bad segments.
 *
 * valid is the number of valid bytes, or RESERVED (-1 again) if the segment
 * is used for either the superblock or the journal, or when the segment is
 * bad.
 *
 * The structure is checked to be 8 bytes large.
 */
struct logfs_segment_entry {
	__be32	ec_level;
	__be32	valid;
};

SIZE_CHECK(logfs_segment_entry, 8);
/**
 * struct logfs_journal_header - header for journal entries (JEs)
 *
 * @h_crc:	crc32 of journal entry
 * @h_len:	length of compressed journal entry, not including header
 * @h_datalen:	length of uncompressed data
 * @h_type:	JE type
 * @h_compr:	compression type
 * @h_pad:	reserved
 *
 * The structure is checked to be 16 bytes large.
 */
struct logfs_journal_header {
	__be32	h_crc;
	__be16	h_len;
	__be16	h_datalen;
	__be16	h_type;
	__u8	h_compr;
	__u8	h_pad[5];
};

SIZE_CHECK(logfs_journal_header, 16);
/*
 * Life expectancy of data.
 *
 * VIM_GC (GC'd data - likely long-living) is not defined by this enum.
 */
enum logfs_vim {
	VIM_DEFAULT	= 0,	/* default vim */
	VIM_SEGFILE	= 1,	/* for segment file only - very short-living */
};
/**
 * struct logfs_je_area - wbuf header
 *
 * @segno:	segment number of area
 * @used_bytes:	number of bytes already used
 * @gc_level:	GC level
 * @vim:	life expectancy of data
 *
 * "Areas" are segments currently being used for writing.  There is at least
 * one area per GC level.  Several may be used to separate long-living from
 * short-living data.  If an area with unknown vim is encountered, it can
 * simply be closed.
 *
 * The write buffer immediately follows this header.
 *
 * Declared packed; the structure is checked to be 10 bytes large.
 */
struct logfs_je_area {
	__be32	segno;
	__be32	used_bytes;
	__u8	gc_level;
	__u8	vim;
} __attribute__((packed));

SIZE_CHECK(logfs_je_area, 10);
/* Combined size of a journal header and an area (wbuf) header */
#define MAX_JOURNAL_HEADER						\
	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
/**
* struct logfs_je_dynsb - dynamic superblock
*
* @ds_gec: global erase count
* @ds_sweeper: current position of GC "sweeper"
* @ds_rename_dir: source directory ino (see dir.c documentation)
* @ds_rename_pos: position of source dd (see dir.c documentation)
* @ds_victim_ino: victims of incomplete dir operation (see dir.c)
* @ds_victim_parent: parent inode of victim (see dir.c)
* @ds_used_bytes: number of used bytes
*/
struct logfs_je_dynsb {
__be64 ds_gec;
__be64 ds_sweeper;
__be64 ds_rename_dir;
__be64 ds_rename_pos;
__be64 ds_victim_ino;
__be64 ds_victim_parent; /* XXX */
__be64
|