diff --git a/mon/make-osd-hybrid.js b/mon/make-osd-hybrid.js
new file mode 100755
index 000000000..487f80df9
--- /dev/null
+++ b/mon/make-osd-hybrid.js
@@ -0,0 +1,414 @@
+#!/usr/bin/nodejs
+// systemd unit generator for hybrid (HDD+SSD) vitastor OSDs
+// Copyright (c) Vitaliy Filippov, 2019+
+// License: VNPL-1.1
+
+// USAGE: nodejs make-osd-hybrid.js [--disable_ssd_cache 0] [--disable_hdd_cache 0] /dev/sda /dev/sdb /dev/sdc /dev/sdd ...
+// I.e. - just pass all HDDs and SSDs mixed, the script will decide where
+// to put journals on its own
+
+const fs = require('fs');
+const fsp = fs.promises;
+const child_process = require('child_process');
+
+// Defaults; overridable from the command line via --<key> <value>
+const options = {
+    debug: 1,
+    journal_size: 1024*1024*1024,
+    min_meta_size: 1024*1024*1024,
+    object_size: 1024*1024,
+    bitmap_granularity: 4096,
+    device_block_size: 4096,
+    disable_ssd_cache: 1,
+    disable_hdd_cache: 1,
+};
+
+// FIX: console.fatal does not exist in Node; use console.error so errors are reported
+run().catch(console.error);
+
+async function run()
+{
+    const device_list = parse_options();
+    await system_or_die("mkdir -p /var/log/vitastor; chown vitastor /var/log/vitastor");
+    // Collect devices
+    const all_devices = await collect_devices(device_list);
+    const ssds = all_devices.filter(d => d.ssd);
+    const hdds = all_devices.filter(d => !d.ssd);
+    // Collect existing OSD units
+    const osd_units = await collect_osd_units();
+    // Count assigned HDD journals and unallocated space for each SSD
+    await check_journal_count(ssds, osd_units);
+    // Create new OSDs
+    await create_new_hybrid_osds(hdds, ssds, osd_units);
+    process.exit(0);
+}
+
+// Parse process.argv into the global options object; returns the list of device paths
+function parse_options()
+{
+    const devices = [];
+    const opt = {};
+    for (let i = 2; i < process.argv.length; i++)
+    {
+        const arg = process.argv[i];
+        if (arg == '--help' || arg == '-h')
+        {
+            opt.help = true;
+            break;
+        }
+        else if (arg.substr(0, 2) == '--')
+            opt[arg.substr(2)] = process.argv[++i];
+        else
+            devices.push(arg);
+    }
+    if (opt.help || !devices.length)
+    {
+        console.log(
+            'Prepare hybrid (HDD+SSD) Vitastor OSDs\n'+
+            '(c) Vitaliy Filippov, 2019+, license: VNPL-1.1\n\n'+
+            'USAGE: nodejs make-osd-hybrid.js [OPTIONS] /dev/sda /dev/sdb /dev/sdc ...\n'+
+            'Just pass all your SSDs and HDDs in any order, the script will distribute OSDs for you.\n\n'+
+            'OPTIONS (with defaults):\n'+
+            Object.keys(options).map(k => ` --${k} ${options[k]}`).join('\n')
+        );
+        process.exit(0);
+    }
+    for (const k in opt)
+        options[k] = opt[k];
+    return devices;
+}
+
+// Collect devices: filter out non-/dev paths, partitions, devices with data,
+// and devices with non-GPT partition tables; detect SSD vs HDD via sysfs "rotational"
+async function collect_devices(devices_to_check)
+{
+    const devices = [];
+    for (const dev of devices_to_check)
+    {
+        if (dev.substr(0, 5) != '/dev/')
+        {
+            console.log(`${dev} does not start with /dev/, skipping`);
+            continue;
+        }
+        if (!await file_exists('/sys/block/'+dev.substr(5)))
+        {
+            console.log(`${dev} is a partition, skipping`);
+            continue;
+        }
+        // Check if the device is an SSD
+        const rot = '/sys/block/'+dev.substr(5)+'/queue/rotational';
+        if (!await file_exists(rot))
+        {
+            console.log(`${dev} does not have ${rot} to check whether it's an SSD, skipping`);
+            continue;
+        }
+        const ssd = !parseInt(await fsp.readFile(rot, { encoding: 'utf-8' }));
+        // Check if the device has partition table
+        let [ has_partition_table, parts ] = await system(`sfdisk --dump ${dev} --json`);
+        if (has_partition_table != 0)
+        {
+            // Check if the device has any data
+            const [ has_data, out ] = await system(`blkid ${dev}`);
+            if (has_data == 0)
+            {
+                console.log(`${dev} contains data, skipping:\n ${out.trim().replace(/\n/g, '\n ')}`);
+                continue;
+            }
+        }
+        parts = parts ? JSON.parse(parts).partitiontable : null;
+        if (parts && parts.label != 'gpt')
+        {
+            console.log(`${dev} contains "${parts.label}" partition table, only GPT is supported, skipping`);
+            continue;
+        }
+        devices.push({
+            path: dev,
+            ssd,
+            parts,
+        });
+    }
+    return devices;
+}
+
+// Collect existing OSD units: parse ExecStart= of each vitastor-osd*.service
+// into a { option: value } map so we know which devices are already assigned
+async function collect_osd_units()
+{
+    const units = [];
+    for (const unit of (await system("ls /etc/systemd/system/vitastor-osd*.service"))[1].trim().split('\n'))
+    {
+        if (!unit)
+        {
+            continue;
+        }
+        // FIX: /m flag is required — ExecStart= is never the first line of a unit file,
+        // so without it ^ never matches and every existing unit would be ignored
+        let cmd = /^ExecStart\s*=\s*(([^\n]*\\\n)*[^\n]*)/m.exec(await fsp.readFile(unit, { encoding: 'utf-8' }));
+        if (!cmd)
+        {
+            console.log('ExecStart= not found in '+unit+', skipping');
+            continue;
+        }
+        let kv = {}, key;
+        // Strip the bash -c '...' wrapper and the ">>logfile 2>&1'" tail, then
+        // tokenize "--key value" pairs (values may be single-quoted)
+        // FIX: allow whitespace between the log path and 2>&1, as generated below
+        cmd = cmd[1].replace(/^bash\s+-c\s+'/, '')
+            .replace(/>>\s*\S+\s*2>\s*&1\s*'$/, '')
+            .replace(/\s*\\\n\s*/g, ' ')
+            .replace(/([^\s']+)|'([^']+)'/g, (m, m1, m2) =>
+            {
+                m1 = m1||m2;
+                if (key == null)
+                {
+                    if (m1.substr(0, 2) != '--')
+                    {
+                        console.log('Strange command line in '+unit+', stopping');
+                        process.exit(1);
+                    }
+                    key = m1.substr(2);
+                }
+                else
+                {
+                    kv[key] = m1;
+                    key = null;
+                }
+            });
+        units.push(kv);
+    }
+    return units;
+}
+
+// Count assigned HDD journals and unallocated space for each SSD
+async function check_journal_count(ssds, osd_units)
+{
+    const units_by_journal = osd_units.reduce((a, c) =>
+    {
+        if (c.journal_device)
+            a[c.journal_device] = c;
+        return a;
+    }, {});
+    for (const dev of ssds)
+    {
+        dev.journals = 0;
+        if (dev.parts)
+        {
+            for (const part of dev.parts.partitions)
+            {
+                if (part.uuid && units_by_journal['/dev/disk/by-partuuid/'+part.uuid.toLowerCase()])
+                {
+                    dev.journals++;
+                }
+            }
+            dev.free = free_from_parttable(dev.parts);
+        }
+        else
+        {
+            // Unpartitioned SSD: the whole device is free
+            dev.free = parseInt(await system_or_die("blockdev --getsize64 "+dev.path));
+        }
+    }
+}
+
+// For each suitable whole-disk HDD partition without an existing OSD unit, create one
+async function create_new_hybrid_osds(hdds, ssds, osd_units)
+{
+    const units_by_disk = osd_units.reduce((a, c) => { a[c.data_device] = c; return a; }, {});
+    for (const dev of hdds)
+    {
+        if (!dev.parts)
+        {
+            // HDD is not partitioned yet, create a single partition
+            // + is the "default value" for sfdisk
+            await system_or_die('sfdisk '+dev.path, 'label: gpt\n\n+ +\n');
+            dev.parts = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
+        }
+        if (dev.parts.partitions.length != 1)
+        {
+            console.log(dev.path+' has more than 1 partition, skipping');
+        }
+        else if ((dev.parts.partitions[0].start + dev.parts.partitions[0].size) != (1 + dev.parts.lastlba))
+        {
+            console.log(dev.path+'1 is not a whole-disk partition, skipping');
+        }
+        else if (!dev.parts.partitions[0].uuid)
+        {
+            console.log(dev.parts.partitions[0].node+' does not have UUID. Please repartition '+dev.path+' with GPT');
+        }
+        else if (!units_by_disk['/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase()])
+        {
+            await create_hybrid_osd(dev, ssds);
+        }
+    }
+}
+
+// Create one OSD: data on the HDD, journal+metadata on the least-loaded SSD
+async function create_hybrid_osd(dev, ssds)
+{
+    // Create a new OSD
+    // Calculate metadata size
+    const data_device = '/dev/disk/by-partuuid/'+dev.parts.partitions[0].uuid.toLowerCase();
+    const data_size = dev.parts.partitions[0].size * dev.parts.sectorsize;
+    const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
+    const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
+    const object_count = Math.floor(data_size / options.object_size);
+    let meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
+    // Leave some extra space for future metadata formats and round metadata area size to multiples of 1 MB
+    meta_size = 2*meta_size;
+    meta_size = Math.ceil(meta_size/1024/1024) * 1024*1024;
+    if (meta_size < options.min_meta_size)
+        meta_size = options.min_meta_size;
+    let journal_size = Math.ceil(options.journal_size/1024/1024) * 1024*1024;
+    // Pick an SSD for journal, balancing the number of journals across SSDs
+    let selected_ssd;
+    for (const ssd of ssds)
+        if (ssd.free >= (meta_size+journal_size) && (!selected_ssd || selected_ssd.journals > ssd.journals))
+            selected_ssd = ssd;
+    if (!selected_ssd)
+    {
+        console.error('Could not find free space for SSD journal and metadata for '+dev.path);
+        process.exit(1);
+    }
+    // Allocate an OSD number
+    const osd_num = (await system_or_die("vitastor-cli alloc-osd")).trim();
+    if (!osd_num)
+    {
+        console.error('Failed to run vitastor-cli alloc-osd');
+        process.exit(1);
+    }
+    console.log('Creating OSD '+osd_num+' on '+dev.path+' (HDD) with journal and metadata on '+selected_ssd.path+' (SSD)');
+    // Add two partitions: journal and metadata
+    const new_parts = await add_partitions(selected_ssd, [ journal_size, meta_size ]);
+    selected_ssd.journals++;
+    const journal_device = '/dev/disk/by-partuuid/'+new_parts[0].uuid.toLowerCase();
+    const meta_device = '/dev/disk/by-partuuid/'+new_parts[1].uuid.toLowerCase();
+    // Wait until the device symlinks appear
+    while (!await file_exists(journal_device))
+    {
+        await new Promise(ok => setTimeout(ok, 100));
+    }
+    while (!await file_exists(meta_device))
+    {
+        await new Promise(ok => setTimeout(ok, 100));
+    }
+    // Zero out metadata and journal
+    await system_or_die("dd if=/dev/zero of="+journal_device+" bs=1M count="+(journal_size/1024/1024)+" oflag=direct");
+    await system_or_die("dd if=/dev/zero of="+meta_device+" bs=1M count="+(meta_size/1024/1024)+" oflag=direct");
+    // Create unit file for the OSD
+    const has_scsi_cache_type = options.disable_ssd_cache &&
+        (await system("ls /sys/block/"+selected_ssd.path.substr(5)+"/device/scsi_disk/*/cache_type"))[0] == 0;
+    // FIX: read the sysfs attribute with cat — the bare path is not an executable command
+    const write_through = options.disable_ssd_cache && (
+        has_scsi_cache_type || selected_ssd.path.substr(5, 4) == 'nvme'
+            && (await system_or_die("cat /sys/block/"+selected_ssd.path.substr(5)+"/queue/write_cache")).trim() == "write through");
+    await fsp.writeFile('/etc/systemd/system/vitastor-osd'+osd_num+'.service',
+`[Unit]
+Description=Vitastor object storage daemon osd.${osd_num}
+After=network-online.target local-fs.target time-sync.target
+Wants=network-online.target local-fs.target time-sync.target
+PartOf=vitastor.target
+
+[Service]
+LimitNOFILE=1048576
+LimitNPROC=1048576
+LimitMEMLOCK=infinity
+ExecStart=bash -c '/usr/bin/vitastor-osd \\
+    --osd_num ${osd_num} ${write_through
+        ? "--disable_meta_fsync 1 --disable_journal_fsync 1 --immediate_commit "+(options.disable_hdd_cache ? "all" : "small")
+        : ""} \\
+    --throttle_small_writes 1 \\
+    --disk_alignment ${options.device_block_size} \\
+    --journal_block_size ${options.device_block_size} \\
+    --meta_block_size ${options.device_block_size} \\
+    --journal_no_same_sector_overwrites true \\
+    --journal_sector_buffer_count 1024 \\
+    --block_size ${options.object_size} \\
+    --data_device ${data_device} \\
+    --journal_device ${journal_device} \\
+    --meta_device ${meta_device} >>/var/log/vitastor/osd${osd_num}.log 2>&1'
+WorkingDirectory=/
+ExecStartPre=+chown vitastor:vitastor ${data_device}
+ExecStartPre=+chown vitastor:vitastor ${journal_device}
+ExecStartPre=+chown vitastor:vitastor ${meta_device}${
+    has_scsi_cache_type
+        ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+journal_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
+        : ""}${
+    options.disable_hdd_cache
+        ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+data_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
+        : ""}
+User=vitastor
+PrivateTmp=false
+TasksMax=infinity
+Restart=always
+StartLimitInterval=0
+RestartSec=10
+
+[Install]
+WantedBy=vitastor.target
+`);
+    await system_or_die("systemctl enable vitastor-osd"+osd_num);
+}
+
+// Append partitions of the given sizes to an SSD, return the new partition entries
+async function add_partitions(dev, sizes)
+{
+    let script = 'label: gpt\n\n';
+    if (dev.parts)
+    {
+        // Old partitions
+        for (const part of dev.parts.partitions)
+        {
+            script += part.node+': '+Object.keys(part).map(k => k == 'node' ? '' : k+'='+part[k]).filter(k => k).join(', ')+'\n';
+        }
+    }
+    // New partitions
+    for (const size of sizes)
+    {
+        script += '+ '+Math.ceil(size/1024)+'KiB\n';
+    }
+    await system_or_die('sfdisk '+dev.path, script);
+    // Get new partition table and find the new partition
+    const newpt = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
+    const old_nodes = dev.parts ? dev.parts.partitions.reduce((a, c) => { a[c.uuid] = true; return a; }, {}) : {};
+    const new_nodes = newpt.partitions.filter(part => !old_nodes[part.uuid]);
+    if (new_nodes.length != sizes.length)
+    {
+        console.error('Failed to partition '+dev.path+': new partitions not found in table');
+        process.exit(1);
+    }
+    dev.parts = newpt;
+    dev.free = free_from_parttable(newpt);
+    return new_nodes;
+}
+
+// Free space in bytes = (usable LBAs minus all partition sizes) * sector size
+function free_from_parttable(pt)
+{
+    let free = pt.lastlba + 1 - pt.firstlba;
+    for (const part of pt.partitions)
+    {
+        free -= part.size;
+    }
+    free *= pt.sectorsize;
+    return free;
+}
+
+// Run a shell command, aborting the whole script on non-zero exit
+async function system_or_die(cmd, input = '')
+{
+    let [ exitcode, stdout, stderr ] = await system(cmd, input);
+    if (exitcode != 0)
+    {
+        console.error(cmd+' failed: '+stderr);
+        process.exit(1);
+    }
+    return stdout;
+}
+
+// Run a shell command with optional stdin, capturing [ exitcode, stdout, stderr ]
+// NOTE(review): body reconstructed from a damaged patch — verify against upstream
+async function system(cmd, input = '')
+{
+    if (options.debug)
+    {
+        process.stderr.write('+ '+cmd+(input ? " <<EOF\n"+input+"EOF" : "")+"\n");
+    }
+    const cp = child_process.spawn(cmd, { shell: true, stdio: [ 'pipe', 'pipe', 'pipe' ] });
+    let stdout = '', stderr = '', finish_cb;
+    cp.stdout.on('data', buf => stdout += buf.toString());
+    cp.stderr.on('data', buf => stderr += buf.toString());
+    cp.on('exit', () => finish_cb && finish_cb());
+    cp.stdin.write(input);
+    cp.stdin.end();
+    if (cp.exitCode == null)
+    {
+        await new Promise(ok => finish_cb = ok);
+    }
+    return [ cp.exitCode, stdout, stderr ];
+}
+
+// Resolve true if <filename> exists and is readable; never rejects
+async function file_exists(filename)
+{
+    return new Promise((ok, no) => fs.access(filename, fs.constants.R_OK, err => ok(!err)));
+}
diff --git a/mon/simple-offsets.js b/mon/simple-offsets.js
index dd9337764..e2ae667db 100644
--- a/mon/simple-offsets.js
+++ b/mon/simple-offsets.js
@@ -49,7 +49,8 @@ async function run()
     }
     options.journal_offset = Math.ceil(options.journal_offset/options.device_block_size)*options.device_block_size;
     const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
-    const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
+    const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
+    const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
     const object_count = Math.floor((device_size-meta_offset)/options.object_size);
     const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
     const data_offset = meta_offset + meta_size;