Exercise 5

We need to obtain a pointer to the block_device structure in Linux so that we can get the block device name and group data by it. As we know, the main filesystem structure is called super_block; it contains the s_bdev field, which is a pointer of type struct block_device*. SystemTap has two tapset functions, MINOR() and MAJOR(), which extract the device numbers from the bd_dev field of that structure. There is also an undocumented bdevname() function, which is more convenient because it returns a string, so we will use it.

We will attach probes to the vfs_write() and vfs_read() functions to trace filesystem operations. The first argument of these functions is a pointer to the file, of type struct file*. The amount of data being written or read is passed in the count argument (available as $count).

The BIO level can be traced with the ioblock tapset, using its ioblock.request probe. It has the following arguments: bdev — block_device pointer; size — amount of data in the request; rw — read or write flag which, after conversion with bio_rw_num(), can be tested for equality with the BIO_READ or BIO_WRITE constants.

Here is resulting deblock.stp script:

  Script file deblock.stp

# Statistics aggregates keyed by block_device pointer: vfstp is fed with
# $count by the vfs_write() probe, biotp with the request size by the
# ioblock.request probe.
global vfstp, biotp;

# VFS layer: account the requested size of every write against the block
# device found through file -> mount -> superblock -> s_bdev.
probe kernel.function("vfs_write") {
    filp = $file;
    if (filp == 0)
        next;

    superblk = @cast(filp, "file")->f_path->mnt->mnt_sb;
    if (superblk == 0)
        next;

    # A superblock with a null s_bdev is skipped.
    blkdev = @cast(superblk, "super_block")->s_bdev;
    if (blkdev == 0)
        next;

    vfstp[blkdev] <<< $count;
}

# BIO layer: account the size of every write request against its device.
# Requests whose direction is not BIO_WRITE are ignored.
probe ioblock.request {
    if (bio_rw_num(rw) == BIO_WRITE)
        biotp[bdev] <<< size;
}

# Once per second: print the VFS and BIO write volume (in KB) for every
# device present in vfstp, then reset both aggregates.
probe timer.s(1) {
    printf("%12s %8s BDEV KB/s\n", "BDEV", "VFS KB/s");

    # Only keys of vfstp are iterated; biotp is looked up with the same key.
    foreach (dev in vfstp) {
        vfs_kb = @sum(vfstp[dev]) / 1024;
        bio_kb = @sum(biotp[dev]) / 1024;
        printf("%12s %8d %d\n", bdevname(dev), vfs_kb, bio_kb);
    }

    delete vfstp;
    delete biotp;
}

To trace readahead (part 2 of the exercise) we need to replace vfs_write with vfs_read and BIO_WRITE with BIO_READ, and, instead of saving the amount of data in each request into the aggregation, save the constant 1 so that the aggregation counts the number of requests.

We can use the scsi.ioentry probe to trace SCSI operations. We could detect which command was used by parsing the CDB buffer, but we will omit that and trace all SCSI operations. Getting the device name, however, is not that easy: the request structure used in the SCSI stack refers to the gendisk and hd_struct structures, but they do not refer to block_device (on the contrary, block_device itself refers to them). So we will use a small trick: the request holds a linked list of bio structures (bio ... biotail) which refer to the block device structure the same way they do in the BIO probes, so we will simply copy the approach used by the ioblock.request probe.

We will get readahead.stp script after applying all these modifications:

  Script file readahead.stp

# Per-block-device operation counters, keyed by block_device pointer:
# vfsops is fed by the vfs_read() probe, bioops by ioblock.request and
# scsiops by scsi.ioentry (each probe adds the constant 1 per event).
global vfsops, bioops, scsiops;

# VFS layer: count every read against the block device found through
# file -> mount -> superblock -> s_bdev.
probe kernel.function("vfs_read") {
    filp = $file;
    if (filp == 0)
        next;

    superblk = @cast(filp, "file")->f_path->mnt->mnt_sb;
    if (superblk == 0)
        next;

    # A superblock with a null s_bdev is skipped.
    blkdev = @cast(superblk, "super_block")->s_bdev;
    if (blkdev == 0)
        next;

    vfsops[blkdev] <<< 1;
}

# BIO layer: count every read request against its device.
# Requests whose direction is not BIO_READ are ignored.
probe ioblock.request {
    if (bio_rw_num(rw) == BIO_READ)
        bioops[bdev] <<< 1;
}

# SCSI layer: count every request seen by scsi.ioentry.  The block device
# is taken from the bio that the request points to; requests without a
# bio, or whose bio has a null bi_bdev, are not counted.
probe scsi.ioentry {
    head_bio = @cast(req_addr, "struct request")->bio;
    if (head_bio == 0)
        next;

    blkdev = @cast(head_bio, "bio")->bi_bdev;
    if (blkdev != 0)
        scsiops[blkdev] <<< 1;
}

# Once per second: print the VFS, BIO and SCSI operation counts for every
# device present in vfsops, then reset all three aggregates.
probe timer.s(1) {
    printf("%12s %8s %8s SCSI OP/s\n", "BDEV", "VFS OP/s", "BDEV OP/s");

    # Only keys of vfsops are iterated; the other two aggregates are
    # looked up with the same key.
    foreach (dev in vfsops) {
        vfs_n = @count(vfsops[dev]);
        bio_n = @count(bioops[dev]);
        scsi_n = @count(scsiops[dev]);
        printf("%12s %8d %8d %d\n", bdevname(dev), vfs_n, bio_n, scsi_n);
    }

    delete vfsops;
    delete bioops;
    delete scsiops;
}

In DTrace, one can get the device name with the ddi_pathname() action or with the devinfo_t translator (which uses it indirectly) applied to the buf structure. Probes from the io provider do this automatically and pass the resulting pseudo-structure as the args[1] argument. Aside from the name, it contains the minor and major device numbers.

Getting the device name at the VFS layer, however, is harder: the vfs_t structure which describes a filesystem has only the device number, vfs_dev. ZFS makes things even harder: there is an intermediate layer called the pool, which hides block devices from the filesystem layer. So we will use mount points as an aggregation key. We will trace filesystem operations by attaching to fop_read() and fop_write(), which accept a pointer to the vnode_t of the file as their first argument and a pointer to the uio structure as their second argument (it describes the user request and thus contains the amount of data).

Using all this we can get our deblock.d script:

  Script file deblock.d

#!/usr/sbin/dtrace -qCs

/*
 * VFSMNTPT(vfs): mount point path of a filesystem, read from the v_path
 * of the vnode it covers; "???" when vfs_vnodecovered is null.
 *
 * The former NBITSMINOR and MAXMIN macros were removed: nothing in this
 * script referenced them (getmajor()/getminor() are D subroutines).
 */
#define VFSMNTPT(vfs)   ((vfs)->vfs_vnodecovered            \
             ? stringof((vfs)->vfs_vnodecovered->v_path)    \
             : "???")

/*
 * VFS layer: sum uio_resid of every write with a non-zero uio_resid,
 * keyed by major/minor device number and mount point of the filesystem
 * that owns the vnode.
 */
fbt::fop_write:entry
/args[1]->uio_resid != 0/
{
    this->fs = args[0]->v_vfsp;
    this->devno = this->fs->vfs_dev;

    @vfs[getmajor(this->devno), getminor(this->devno),
        VFSMNTPT(this->fs)] = sum(args[1]->uio_resid);
}

/*
 * BIO layer: sum b_bcount of every non-empty write buffer, keyed by
 * major/minor device number and device statistics name.
 *
 * A write is detected as "not a read" instead of testing B_WRITE:
 * NOTE(review): B_WRITE is described in <sys/buf.h> as a non-read
 * pseudo-flag and is not set by every code path that issues a write,
 * so testing it can miss writes. Confirm against the target release.
 */
io:::start
/args[0]->b_bcount != 0 && !(args[0]->b_flags & B_READ)/
{
    @bio[args[1]->dev_major,
         args[1]->dev_minor,
         args[1]->dev_statname] = sum(args[0]->b_bcount);
}

/*
 * Once per second: divide both aggregations by 1024 (bytes to KB), print
 * them side by side and reset them for the next interval.
 */
tick-1s
{
    normalize(@vfs, 1024);
    normalize(@bio, 1024);

    printf("%9s %16s %8s BDEV KB/s\n", "DEV_T", "NAME", "VFS KB/s");
    printa("%3d,%-5d %16s %8@u %@u\n", @vfs, @bio);

    trunc(@vfs);
    trunc(@bio);
}

We will trace the SCSI stack by attaching to the scsi-transport-dispatch probe, which receives a pointer to buf as its first argument. It is very similar to the probes from the io provider, except that this probe does not apply translators to the buffer.

The other changes in readahead.d are similar to those that were made for SystemTap:

  Script file readahead.d

#!/usr/sbin/dtrace -qCs

/*
 * VFSMNTPT(vfs): mount point path of a filesystem, read from the v_path
 * of the vnode it covers; "???" when vfs_vnodecovered is null.
 */
#define VFSMNTPT(vfs)   ((vfs)->vfs_vnodecovered            \
             ? stringof((vfs)->vfs_vnodecovered->v_path)    \
             : "???")

/* HASDI(bp): true when the buf at address bp has a non-null b_dip. */
#define HASDI(bp)       (((struct buf *)(bp))->b_dip != 0)

/*
 * DEVINFO(bp): apply the devinfo_t translator to the buf at address bp.
 * The xlate operator requires the output type in angle brackets.
 */
#define DEVINFO(bp)     xlate<devinfo_t *>((struct buf *)(bp))

/*
 * VFS layer: count every read with a non-zero uio_resid, keyed by
 * major/minor device number and mount point of the filesystem that owns
 * the vnode.
 */
fbt::fop_read:entry
/args[1]->uio_resid != 0/
{
    this->fs = args[0]->v_vfsp;
    this->devno = this->fs->vfs_dev;

    @vfs[getmajor(this->devno), getminor(this->devno),
        VFSMNTPT(this->fs)] = count();
}

/*
 * BIO layer: count every non-empty buffer that has B_READ set, keyed by
 * major/minor device number and device statistics name.
 */
io:::start
/(args[0]->b_bcount != 0) && (args[0]->b_flags & B_READ)/
{
    @bio[args[1]->dev_major, args[1]->dev_minor,
        args[1]->dev_statname] = count();
}

/*
 * SCSI layer: count every dispatch whose arg0 (treated as a struct buf
 * pointer by the HASDI/DEVINFO macros) is non-null and has a non-null
 * b_dip, keyed by the device fields produced by DEVINFO().
 */
scsi-transport-dispatch
/(arg0 != 0) && HASDI(arg0)/
{
    @scsi[DEVINFO(arg0)->dev_major, DEVINFO(arg0)->dev_minor,
        DEVINFO(arg0)->dev_statname] = count();
}

/*
 * Once per second: print the three operation counters side by side and
 * reset them for the next interval.
 */
tick-1s
{
    printf("%9s %16s %8s %8s SCSI OP/s\n", "DEV_T", "NAME", "VFS OP/s", "BDEV OP/s");
    printa("%3d,%-5d %16s %8@u %@8u %@u\n", @vfs, @bio, @scsi);

    trunc(@vfs);
    trunc(@bio);
    trunc(@scsi);
}