forked from vitalif/vitastor
Compare commits
3 Commits
master
...
msgr-iothr
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | 1d6f1203f8 | |
Vitaliy Filippov | d4dc3f4a9d | |
Vitaliy Filippov | 0c0439b33b |
|
@ -11,8 +11,7 @@ To enable Vitastor support in an OpenStack installation:
|
|||
- Install vitastor-client, patched QEMU and libvirt packages from Vitastor DEB or RPM repository
|
||||
- Use `patches/nova-21.diff` or `patches/nova-23.diff` to patch your Nova installation.
|
||||
Patch 21 fits Nova 21-22, patch 23 fits Nova 23-24.
|
||||
- Install `patches/cinder-vitastor-21.py` or `pathces/cinder-vitastor-22.py` as `..../cinder/volume/drivers/vitastor.py`
|
||||
Patch 21 fits Cinder up 21 (zed), Patch 22 fits Cinder after 22 (2023.1)
|
||||
- Install `patches/cinder-vitastor.py` as `..../cinder/volume/drivers/vitastor.py`
|
||||
- Define a volume type in cinder.conf (see below)
|
||||
- Block network access from VMs to Vitastor network (to OSDs and etcd),
|
||||
because Vitastor doesn't support authentication
|
||||
|
|
|
@ -11,8 +11,7 @@
|
|||
- Установите пакеты vitastor-client, libvirt и QEMU из DEB или RPM репозитория Vitastor
|
||||
- Примените патч `patches/nova-21.diff` или `patches/nova-23.diff` к вашей инсталляции Nova.
|
||||
nova-21.diff подходит для Nova 21-22, nova-23.diff подходит для Nova 23-24.
|
||||
- Скопируйте `patches/cinder-vitastor-21.py` или `pathces/cinder-vitastor-22.py` в инсталляцию Cinder как `cinder/volume/drivers/vitastor.py`.
|
||||
`cinder-vitastor-21.py` подходит для Cinder 21 (zed) и младше, `cinder-vitastor-22.py` подходит для Cinder 22 (2023.1) и старше.
|
||||
- Скопируйте `patches/cinder-vitastor.py` в инсталляцию Cinder как `cinder/volume/drivers/vitastor.py`
|
||||
- Создайте тип томов в cinder.conf (см. ниже)
|
||||
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию
|
||||
- Перезапустите Cinder и Nova
|
||||
|
|
|
@ -1,966 +0,0 @@
|
|||
# Vitastor Driver for OpenStack Cinder
|
||||
#
|
||||
# --------------------------------------------
|
||||
# Install as cinder/volume/drivers/vitastor.py
|
||||
# --------------------------------------------
|
||||
#
|
||||
# Copyright 2020 Vitaliy Filippov
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Cinder Vitastor Driver"""
|
||||
|
||||
import binascii
|
||||
import base64
|
||||
import errno
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from castellan import key_manager
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log as logging
|
||||
from oslo_service import loopingcall
|
||||
from oslo_concurrency import processutils
|
||||
from oslo_utils import encodeutils
|
||||
from oslo_utils import excutils
|
||||
from oslo_utils import fileutils
|
||||
from oslo_utils import units
|
||||
import six
|
||||
from six.moves.urllib import request
|
||||
|
||||
from cinder import exception
|
||||
from cinder.i18n import _
|
||||
from cinder.image import image_utils
|
||||
from cinder import interface
|
||||
from cinder import objects
|
||||
from cinder.objects import fields
|
||||
from cinder import utils
|
||||
from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '1.4.1'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
VITASTOR_OPTS = [
|
||||
cfg.StrOpt(
|
||||
'vitastor_config_path',
|
||||
default='/etc/vitastor/vitastor.conf',
|
||||
help='Vitastor configuration file path'
|
||||
),
|
||||
cfg.StrOpt(
|
||||
'vitastor_etcd_address',
|
||||
default='',
|
||||
help='Vitastor etcd address(es)'),
|
||||
cfg.StrOpt(
|
||||
'vitastor_etcd_prefix',
|
||||
default='/vitastor',
|
||||
help='Vitastor etcd prefix'
|
||||
),
|
||||
cfg.StrOpt(
|
||||
'vitastor_pool_id',
|
||||
default='',
|
||||
help='Vitastor pool ID to use for volumes'
|
||||
),
|
||||
# FIXME exclusive_cinder_pool ?
|
||||
]
|
||||
|
||||
CONF = cfg.CONF
|
||||
CONF.register_opts(VITASTOR_OPTS, group = configuration.SHARED_CONF_GROUP)
|
||||
|
||||
class VitastorDriverException(exception.VolumeDriverException):
|
||||
message = _("Vitastor Cinder driver failure: %(reason)s")
|
||||
|
||||
@interface.volumedriver
|
||||
class VitastorDriver(driver.CloneableImageVD,
|
||||
driver.ManageableVD, driver.ManageableSnapshotsVD,
|
||||
driver.BaseVD):
|
||||
"""Implements Vitastor volume commands."""
|
||||
|
||||
cfg = {}
|
||||
_etcd_urls = []
|
||||
|
||||
def __init__(self, active_backend_id = None, *args, **kwargs):
|
||||
super(VitastorDriver, self).__init__(*args, **kwargs)
|
||||
self.configuration.append_config_values(VITASTOR_OPTS)
|
||||
|
||||
@classmethod
|
||||
def get_driver_options(cls):
|
||||
additional_opts = cls._get_oslo_driver_opts(
|
||||
'reserved_percentage',
|
||||
'max_over_subscription_ratio',
|
||||
'volume_dd_blocksize'
|
||||
)
|
||||
return VITASTOR_OPTS + additional_opts
|
||||
|
||||
def do_setup(self, context):
|
||||
"""Performs initialization steps that could raise exceptions."""
|
||||
super(VitastorDriver, self).do_setup(context)
|
||||
# Make sure configuration is in UTF-8
|
||||
for attr in [ 'config_path', 'etcd_address', 'etcd_prefix', 'pool_id' ]:
|
||||
val = self.configuration.safe_get('vitastor_'+attr)
|
||||
if val is not None:
|
||||
self.cfg[attr] = utils.convert_str(val)
|
||||
self.cfg = self._load_config(self.cfg)
|
||||
|
||||
def _load_config(self, cfg):
|
||||
# Try to load configuration file
|
||||
try:
|
||||
f = open(cfg['config_path'] or '/etc/vitastor/vitastor.conf')
|
||||
conf = json.loads(f.read())
|
||||
f.close()
|
||||
for k in conf:
|
||||
cfg[k] = cfg.get(k, conf[k])
|
||||
except:
|
||||
pass
|
||||
if isinstance(cfg['etcd_address'], str):
|
||||
cfg['etcd_address'] = cfg['etcd_address'].split(',')
|
||||
# Sanitize etcd URLs
|
||||
for i, etcd_url in enumerate(cfg['etcd_address']):
|
||||
ssl = False
|
||||
if etcd_url.lower().startswith('http://'):
|
||||
etcd_url = etcd_url[7:]
|
||||
elif etcd_url.lower().startswith('https://'):
|
||||
etcd_url = etcd_url[8:]
|
||||
ssl = True
|
||||
if etcd_url.find('/') < 0:
|
||||
etcd_url += '/v3'
|
||||
if ssl:
|
||||
etcd_url = 'https://'+etcd_url
|
||||
else:
|
||||
etcd_url = 'http://'+etcd_url
|
||||
cfg['etcd_address'][i] = etcd_url
|
||||
return cfg
|
||||
|
||||
def check_for_setup_error(self):
|
||||
"""Returns an error if prerequisites aren't met."""
|
||||
|
||||
def _encode_etcd_key(self, key):
|
||||
if not isinstance(key, bytes):
|
||||
key = str(key).encode('utf-8')
|
||||
return base64.b64encode(self.cfg['etcd_prefix'].encode('utf-8')+b'/'+key).decode('utf-8')
|
||||
|
||||
def _encode_etcd_value(self, value):
|
||||
if not isinstance(value, bytes):
|
||||
value = str(value).encode('utf-8')
|
||||
return base64.b64encode(value).decode('utf-8')
|
||||
|
||||
def _encode_etcd_requests(self, obj):
|
||||
for v in obj:
|
||||
for rt in v:
|
||||
if 'key' in v[rt]:
|
||||
v[rt]['key'] = self._encode_etcd_key(v[rt]['key'])
|
||||
if 'range_end' in v[rt]:
|
||||
v[rt]['range_end'] = self._encode_etcd_key(v[rt]['range_end'])
|
||||
if 'value' in v[rt]:
|
||||
v[rt]['value'] = self._encode_etcd_value(v[rt]['value'])
|
||||
|
||||
def _etcd_txn(self, params):
|
||||
if 'compare' in params:
|
||||
for v in params['compare']:
|
||||
if 'key' in v:
|
||||
v['key'] = self._encode_etcd_key(v['key'])
|
||||
if 'failure' in params:
|
||||
self._encode_etcd_requests(params['failure'])
|
||||
if 'success' in params:
|
||||
self._encode_etcd_requests(params['success'])
|
||||
body = json.dumps(params).encode('utf-8')
|
||||
headers = {
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
err = None
|
||||
for etcd_url in self.cfg['etcd_address']:
|
||||
try:
|
||||
resp = request.urlopen(request.Request(etcd_url+'/kv/txn', body, headers), timeout = 5)
|
||||
data = json.loads(resp.read())
|
||||
if 'responses' not in data:
|
||||
data['responses'] = []
|
||||
for i, resp in enumerate(data['responses']):
|
||||
if 'response_range' in resp:
|
||||
if 'kvs' not in resp['response_range']:
|
||||
resp['response_range']['kvs'] = []
|
||||
for kv in resp['response_range']['kvs']:
|
||||
kv['key'] = base64.b64decode(kv['key'].encode('utf-8')).decode('utf-8')
|
||||
if kv['key'].startswith(self.cfg['etcd_prefix']+'/'):
|
||||
kv['key'] = kv['key'][len(self.cfg['etcd_prefix'])+1 : ]
|
||||
kv['value'] = json.loads(base64.b64decode(kv['value'].encode('utf-8')))
|
||||
if len(resp.keys()) != 1:
|
||||
LOG.exception('unknown responses['+str(i)+'] format: '+json.dumps(resp))
|
||||
else:
|
||||
resp = data['responses'][i] = resp[list(resp.keys())[0]]
|
||||
return data
|
||||
except Exception as e:
|
||||
LOG.exception('error calling etcd transaction: '+body.decode('utf-8')+'\nerror: '+str(e))
|
||||
err = e
|
||||
raise err
|
||||
|
||||
def _etcd_foreach(self, prefix, add_fn):
|
||||
total = 0
|
||||
batch = 1000
|
||||
begin = prefix+'/'
|
||||
while True:
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': {
|
||||
'key': begin,
|
||||
'range_end': prefix+'0',
|
||||
'limit': batch+1,
|
||||
} },
|
||||
] })
|
||||
i = 0
|
||||
while i < batch and i < len(resp['responses'][0]['kvs']):
|
||||
kv = resp['responses'][0]['kvs'][i]
|
||||
add_fn(kv)
|
||||
i += 1
|
||||
if len(resp['responses'][0]['kvs']) <= batch:
|
||||
break
|
||||
begin = resp['responses'][0]['kvs'][batch]['key']
|
||||
return total
|
||||
|
||||
def _update_volume_stats(self):
|
||||
location_info = json.dumps({
|
||||
'config': self.configuration.vitastor_config_path,
|
||||
'etcd_address': self.configuration.vitastor_etcd_address,
|
||||
'etcd_prefix': self.configuration.vitastor_etcd_prefix,
|
||||
'pool_id': self.configuration.vitastor_pool_id,
|
||||
})
|
||||
|
||||
stats = {
|
||||
'vendor_name': 'Vitastor',
|
||||
'driver_version': self.VERSION,
|
||||
'storage_protocol': 'vitastor',
|
||||
'total_capacity_gb': 'unknown',
|
||||
'free_capacity_gb': 'unknown',
|
||||
# FIXME check if safe_get is required
|
||||
'reserved_percentage': self.configuration.safe_get('reserved_percentage'),
|
||||
'multiattach': True,
|
||||
'thin_provisioning_support': True,
|
||||
'max_over_subscription_ratio': self.configuration.safe_get('max_over_subscription_ratio'),
|
||||
'location_info': location_info,
|
||||
'backend_state': 'down',
|
||||
'volume_backend_name': self.configuration.safe_get('volume_backend_name') or 'vitastor',
|
||||
'replication_enabled': False,
|
||||
}
|
||||
|
||||
try:
|
||||
pool_stats = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'pool/stats/'+str(self.cfg['pool_id']) } }
|
||||
] })
|
||||
total_provisioned = 0
|
||||
def add_total(kv):
|
||||
nonlocal total_provisioned
|
||||
if kv['key'].find('@') >= 0:
|
||||
total_provisioned += kv['value']['size']
|
||||
self._etcd_foreach('config/inode/'+str(self.cfg['pool_id']), lambda kv: add_total(kv))
|
||||
stats['provisioned_capacity_gb'] = round(total_provisioned/1024.0/1024.0/1024.0, 2)
|
||||
pool_stats = pool_stats['responses'][0]['kvs']
|
||||
if len(pool_stats):
|
||||
pool_stats = pool_stats[0]['value']
|
||||
stats['free_capacity_gb'] = round(1024.0*(pool_stats['total_raw_tb']-pool_stats['used_raw_tb'])/pool_stats['raw_to_usable'], 2)
|
||||
stats['total_capacity_gb'] = round(1024.0*pool_stats['total_raw_tb'], 2)
|
||||
stats['backend_state'] = 'up'
|
||||
except Exception as e:
|
||||
# just log and return unknown capacities
|
||||
LOG.exception('error getting vitastor pool stats: '+str(e))
|
||||
|
||||
self._stats = stats
|
||||
|
||||
def get_volume_stats(self, refresh=False):
|
||||
"""Get volume stats.
|
||||
If 'refresh' is True, run update the stats first.
|
||||
"""
|
||||
if not self._stats or refresh:
|
||||
self._update_volume_stats()
|
||||
|
||||
return self._stats
|
||||
|
||||
def _next_id(self, resp):
|
||||
if len(resp['kvs']) == 0:
|
||||
return (1, 0)
|
||||
else:
|
||||
return (1 + resp['kvs'][0]['value'], resp['kvs'][0]['mod_revision'])
|
||||
|
||||
def create_volume(self, volume):
|
||||
"""Creates a logical volume."""
|
||||
|
||||
size = int(volume.size) * units.Gi
|
||||
# FIXME: Check if convert_str is really required
|
||||
vol_name = utils.convert_str(volume.name)
|
||||
if vol_name.find('@') >= 0 or vol_name.find('/') >= 0:
|
||||
raise exception.VolumeBackendAPIException(data = '@ and / are forbidden in volume and snapshot names')
|
||||
|
||||
LOG.debug("creating volume '%s'", vol_name)
|
||||
|
||||
self._create_image(vol_name, { 'size': size })
|
||||
|
||||
if volume.encryption_key_id:
|
||||
self._create_encrypted_volume(volume, volume.obj_context)
|
||||
|
||||
volume_update = {}
|
||||
return volume_update
|
||||
|
||||
def _create_encrypted_volume(self, volume, context):
|
||||
"""Create a new LUKS encrypted image directly in Vitastor."""
|
||||
vol_name = utils.convert_str(volume.name)
|
||||
f, opts = self._encrypt_opts(volume, context)
|
||||
# FIXME: Check if it works at all :-)
|
||||
self._execute(
|
||||
'qemu-img', 'convert', '-f', 'luks', *opts,
|
||||
'vitastor:image='+vol_name.replace(':', '\\:')+self._qemu_args(),
|
||||
'%sM' % (volume.size * 1024)
|
||||
)
|
||||
f.close()
|
||||
|
||||
def _encrypt_opts(self, volume, context):
|
||||
encryption = volume_utils.check_encryption_provider(self.db, volume, context)
|
||||
# Fetch the key associated with the volume and decode the passphrase
|
||||
keymgr = key_manager.API(CONF)
|
||||
key = keymgr.get(context, encryption['encryption_key_id'])
|
||||
passphrase = binascii.hexlify(key.get_encoded()).decode('utf-8')
|
||||
# Decode the dm-crypt style cipher spec into something qemu-img can use
|
||||
cipher_spec = image_utils.decode_cipher(encryption['cipher'], encryption['key_size'])
|
||||
tmp_dir = volume_utils.image_conversion_dir()
|
||||
f = tempfile.NamedTemporaryFile(prefix = 'luks_', dir = tmp_dir)
|
||||
f.write(passphrase)
|
||||
f.flush()
|
||||
return (f, [
|
||||
'--object', 'secret,id=luks_sec,format=raw,file=%(passfile)s' % {'passfile': f.name},
|
||||
'-o', 'key-secret=luks_sec,cipher-alg=%(cipher_alg)s,cipher-mode=%(cipher_mode)s,ivgen-alg=%(ivgen_alg)s' % cipher_spec,
|
||||
])
|
||||
|
||||
def create_snapshot(self, snapshot):
|
||||
"""Creates a volume snapshot."""
|
||||
|
||||
vol_name = utils.convert_str(snapshot.volume_name)
|
||||
snap_name = utils.convert_str(snapshot.name)
|
||||
if snap_name.find('@') >= 0 or snap_name.find('/') >= 0:
|
||||
raise exception.VolumeBackendAPIException(data = '@ and / are forbidden in volume and snapshot names')
|
||||
self._create_snapshot(vol_name, vol_name+'@'+snap_name)
|
||||
|
||||
def snapshot_revert_use_temp_snapshot(self):
|
||||
"""Disable the use of a temporary snapshot on revert."""
|
||||
return False
|
||||
|
||||
def revert_to_snapshot(self, context, volume, snapshot):
|
||||
"""Revert a volume to a given snapshot."""
|
||||
|
||||
vol_name = utils.convert_str(snapshot.volume_name)
|
||||
snap_name = utils.convert_str(snapshot.name)
|
||||
|
||||
# Delete the image and recreate it from the snapshot
|
||||
args = [ 'vitastor-cli', 'rm', vol_name, *(self._vitastor_args()) ]
|
||||
try:
|
||||
self._execute(*args)
|
||||
except processutils.ProcessExecutionError as exc:
|
||||
LOG.error("Failed to delete image "+vol_name+": "+exc)
|
||||
raise exception.VolumeBackendAPIException(data = exc.stderr)
|
||||
args = [
|
||||
'vitastor-cli', 'create', '--parent', vol_name+'@'+snap_name,
|
||||
vol_name, *(self._vitastor_args())
|
||||
]
|
||||
try:
|
||||
self._execute(*args)
|
||||
except processutils.ProcessExecutionError as exc:
|
||||
LOG.error("Failed to recreate image "+vol_name+" from "+vol_name+"@"+snap_name+": "+exc)
|
||||
raise exception.VolumeBackendAPIException(data = exc.stderr)
|
||||
|
||||
def delete_snapshot(self, snapshot):
|
||||
"""Deletes a snapshot."""
|
||||
|
||||
vol_name = utils.convert_str(snapshot.volume_name)
|
||||
snap_name = utils.convert_str(snapshot.name)
|
||||
|
||||
args = [
|
||||
'vitastor-cli', 'rm', vol_name+'@'+snap_name,
|
||||
*(self._vitastor_args())
|
||||
]
|
||||
try:
|
||||
self._execute(*args)
|
||||
except processutils.ProcessExecutionError as exc:
|
||||
LOG.error("Failed to remove snapshot "+vol_name+'@'+snap_name+": "+exc)
|
||||
raise exception.VolumeBackendAPIException(data = exc.stderr)
|
||||
|
||||
def _child_count(self, parents):
|
||||
children = 0
|
||||
def add_child(kv):
|
||||
nonlocal children
|
||||
children += self._check_parent(kv, parents)
|
||||
self._etcd_foreach('config/inode', lambda kv: add_child(kv))
|
||||
return children
|
||||
|
||||
def _check_parent(self, kv, parents):
|
||||
if 'parent_id' not in kv['value']:
|
||||
return 0
|
||||
parent_id = kv['value']['parent_id']
|
||||
_, _, pool_id, inode_id = kv['key'].split('/')
|
||||
parent_pool_id = pool_id
|
||||
if 'parent_pool_id' in kv['value'] and kv['value']['parent_pool_id']:
|
||||
parent_pool_id = kv['value']['parent_pool_id']
|
||||
inode = (int(pool_id) << 48) | (int(inode_id) & 0xffffffffffff)
|
||||
parent = (int(parent_pool_id) << 48) | (int(parent_id) & 0xffffffffffff)
|
||||
if parent in parents and inode not in parents:
|
||||
return 1
|
||||
return 0
|
||||
|
||||
def create_cloned_volume(self, volume, src_vref):
|
||||
"""Create a cloned volume from another volume."""
|
||||
|
||||
size = int(volume.size) * units.Gi
|
||||
src_name = utils.convert_str(src_vref.name)
|
||||
dest_name = utils.convert_str(volume.name)
|
||||
if dest_name.find('@') >= 0 or dest_name.find('/') >= 0:
|
||||
raise exception.VolumeBackendAPIException(data = '@ and / are forbidden in volume and snapshot names')
|
||||
|
||||
# FIXME Do full copy if requested (cfg.disable_clone)
|
||||
|
||||
if src_vref.admin_metadata.get('readonly') == 'True':
|
||||
# source volume is a volume-image cache entry or other readonly volume
|
||||
# clone without intermediate snapshot
|
||||
src = self._get_image(src_name)
|
||||
LOG.debug("creating image '%s' from '%s'", dest_name, src_name)
|
||||
new_cfg = self._create_image(dest_name, {
|
||||
'size': size,
|
||||
'parent_id': src['idx']['id'],
|
||||
'parent_pool_id': src['idx']['pool_id'],
|
||||
})
|
||||
return {}
|
||||
|
||||
clone_snap = "%s@%s.clone_snap" % (src_name, dest_name)
|
||||
make_img = True
|
||||
if (volume.display_name and
|
||||
volume.display_name.startswith('image-') and
|
||||
src_vref.project_id != volume.project_id):
|
||||
# idiotic openstack creates image-volume cache entries
|
||||
# as clones of normal VM volumes... :-X prevent it :-D
|
||||
clone_snap = dest_name
|
||||
make_img = False
|
||||
|
||||
LOG.debug("creating layer '%s' under '%s'", clone_snap, src_name)
|
||||
new_cfg = self._create_snapshot(src_name, clone_snap, True)
|
||||
if make_img:
|
||||
# Then create a clone from it
|
||||
new_cfg = self._create_image(dest_name, {
|
||||
'size': size,
|
||||
'parent_id': new_cfg['parent_id'],
|
||||
'parent_pool_id': new_cfg['parent_pool_id'],
|
||||
})
|
||||
|
||||
return {}
|
||||
|
||||
def create_volume_from_snapshot(self, volume, snapshot):
|
||||
"""Creates a cloned volume from an existing snapshot."""
|
||||
|
||||
vol_name = utils.convert_str(volume.name)
|
||||
snap_name = utils.convert_str(snapshot.name)
|
||||
|
||||
snap = self._get_image('volume-'+snapshot.volume_id+'@'+snap_name)
|
||||
if not snap:
|
||||
raise exception.SnapshotNotFound(snapshot_id = snap_name)
|
||||
snap_inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])
|
||||
snap_pool_id = int(resp['responses'][0]['kvs'][0]['value']['pool_id'])
|
||||
|
||||
size = snap['cfg']['size']
|
||||
if int(volume.size):
|
||||
size = int(volume.size) * units.Gi
|
||||
new_cfg = self._create_image(vol_name, {
|
||||
'size': size,
|
||||
'parent_id': snap['idx']['id'],
|
||||
'parent_pool_id': snap['idx']['pool_id'],
|
||||
})
|
||||
|
||||
return {}
|
||||
|
||||
def _vitastor_args(self):
|
||||
args = []
|
||||
for k in [ 'config_path', 'etcd_address', 'etcd_prefix' ]:
|
||||
v = self.configuration.safe_get('vitastor_'+k)
|
||||
if v:
|
||||
args.extend(['--'+k, v])
|
||||
return args
|
||||
|
||||
def _qemu_args(self):
|
||||
args = ''
|
||||
for k in [ 'config_path', 'etcd_address', 'etcd_prefix' ]:
|
||||
v = self.configuration.safe_get('vitastor_'+k)
|
||||
kk = k
|
||||
if kk == 'etcd_address':
|
||||
# FIXME use etcd_address in qemu driver
|
||||
kk = 'etcd_host'
|
||||
if v:
|
||||
args += ':'+kk.replace('_', '-')+'='+v.replace(':', '\\:')
|
||||
return args
|
||||
|
||||
def delete_volume(self, volume):
|
||||
"""Deletes a logical volume."""
|
||||
|
||||
vol_name = utils.convert_str(volume.name)
|
||||
|
||||
# Find the volume and all its snapshots
|
||||
range_end = b'index/image/' + vol_name.encode('utf-8')
|
||||
range_end = range_end[0 : len(range_end)-1] + six.int2byte(range_end[len(range_end)-1] + 1)
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'index/image/'+vol_name, 'range_end': range_end } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) == 0:
|
||||
# already deleted
|
||||
LOG.info("volume %s no longer exists in backend", vol_name)
|
||||
return
|
||||
layers = resp['responses'][0]['kvs']
|
||||
layer_ids = {}
|
||||
for kv in layers:
|
||||
inode_id = int(kv['value']['id'])
|
||||
pool_id = int(kv['value']['pool_id'])
|
||||
inode_pool_id = (pool_id << 48) | (inode_id & 0xffffffffffff)
|
||||
layer_ids[inode_pool_id] = True
|
||||
|
||||
# Check if the volume has clones and raise 'busy' if so
|
||||
children = self._child_count(layer_ids)
|
||||
if children > 0:
|
||||
raise exception.VolumeIsBusy(volume_name = vol_name)
|
||||
|
||||
# Clear data
|
||||
for kv in layers:
|
||||
args = [
|
||||
'vitastor-cli', 'rm-data', '--pool', str(kv['value']['pool_id']),
|
||||
'--inode', str(kv['value']['id']), '--progress', '0',
|
||||
*(self._vitastor_args())
|
||||
]
|
||||
try:
|
||||
self._execute(*args)
|
||||
except processutils.ProcessExecutionError as exc:
|
||||
LOG.error("Failed to remove layer "+kv['key']+": "+exc)
|
||||
raise exception.VolumeBackendAPIException(data = exc.stderr)
|
||||
|
||||
# Delete all layers from etcd
|
||||
requests = []
|
||||
for kv in layers:
|
||||
requests.append({ 'request_delete_range': { 'key': kv['key'] } })
|
||||
requests.append({ 'request_delete_range': { 'key': 'config/inode/'+str(kv['value']['pool_id'])+'/'+str(kv['value']['id']) } })
|
||||
self._etcd_txn({ 'success': requests })
|
||||
|
||||
def retype(self, context, volume, new_type, diff, host):
|
||||
"""Change extra type specifications for a volume."""
|
||||
|
||||
# FIXME Maybe (in the future) support multiple pools as different types
|
||||
return True, {}
|
||||
|
||||
def ensure_export(self, context, volume):
|
||||
"""Synchronously recreates an export for a logical volume."""
|
||||
pass
|
||||
|
||||
def create_export(self, context, volume, connector):
|
||||
"""Exports the volume."""
|
||||
pass
|
||||
|
||||
def remove_export(self, context, volume):
|
||||
"""Removes an export for a logical volume."""
|
||||
pass
|
||||
|
||||
def _create_image(self, vol_name, cfg):
|
||||
pool_s = str(self.cfg['pool_id'])
|
||||
image_id = 0
|
||||
while image_id == 0:
|
||||
# check if the image already exists and find a free ID
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'index/image/'+vol_name } },
|
||||
{ 'request_range': { 'key': 'index/maxid/'+pool_s } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) > 0:
|
||||
# already exists
|
||||
raise exception.VolumeBackendAPIException(data = 'Volume '+vol_name+' already exists')
|
||||
image_id, id_mod = self._next_id(resp['responses'][1])
|
||||
# try to create the image
|
||||
resp = self._etcd_txn({ 'compare': [
|
||||
{ 'target': 'MOD', 'mod_revision': id_mod, 'key': 'index/maxid/'+pool_s },
|
||||
{ 'target': 'VERSION', 'version': 0, 'key': 'index/image/'+vol_name },
|
||||
{ 'target': 'VERSION', 'version': 0, 'key': 'config/inode/'+pool_s+'/'+str(image_id) },
|
||||
], 'success': [
|
||||
{ 'request_put': { 'key': 'index/maxid/'+pool_s, 'value': image_id } },
|
||||
{ 'request_put': { 'key': 'index/image/'+vol_name, 'value': json.dumps({
|
||||
'id': image_id, 'pool_id': self.cfg['pool_id']
|
||||
}) } },
|
||||
{ 'request_put': { 'key': 'config/inode/'+pool_s+'/'+str(image_id), 'value': json.dumps({
|
||||
**cfg, 'name': vol_name,
|
||||
}) } },
|
||||
] })
|
||||
if not resp.get('succeeded'):
|
||||
# repeat
|
||||
image_id = 0
|
||||
|
||||
def _create_snapshot(self, vol_name, snap_vol_name, allow_existing = False):
|
||||
while True:
|
||||
# check if the image already exists and snapshot doesn't
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'index/image/'+vol_name } },
|
||||
{ 'request_range': { 'key': 'index/image/'+snap_vol_name } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) == 0:
|
||||
raise exception.VolumeBackendAPIException(data = 'Volume '+vol_name+' does not exist')
|
||||
if len(resp['responses'][1]['kvs']) > 0:
|
||||
if allow_existing:
|
||||
snap_idx = resp['responses'][1]['kvs'][0]['value']
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'config/inode/'+str(snap_idx['pool_id'])+'/'+str(snap_idx['id']) } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) == 0:
|
||||
raise exception.VolumeBackendAPIException(data =
|
||||
'Volume '+snap_vol_name+' is already indexed, but does not exist'
|
||||
)
|
||||
return resp['responses'][0]['kvs'][0]['value']
|
||||
raise exception.VolumeBackendAPIException(
|
||||
data = 'Volume '+snap_vol_name+' already exists'
|
||||
)
|
||||
vol_idx = resp['responses'][0]['kvs'][0]['value']
|
||||
vol_idx_mod = resp['responses'][0]['kvs'][0]['mod_revision']
|
||||
# get image inode config and find a new ID
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'config/inode/'+str(vol_idx['pool_id'])+'/'+str(vol_idx['id']) } },
|
||||
{ 'request_range': { 'key': 'index/maxid/'+str(self.cfg['pool_id']) } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) == 0:
|
||||
raise exception.VolumeBackendAPIException(data = 'Volume '+vol_name+' does not exist')
|
||||
vol_cfg = resp['responses'][0]['kvs'][0]['value']
|
||||
vol_mod = resp['responses'][0]['kvs'][0]['mod_revision']
|
||||
new_id, id_mod = self._next_id(resp['responses'][1])
|
||||
# try to redirect image to the new inode
|
||||
new_cfg = {
|
||||
**vol_cfg, 'name': vol_name, 'parent_id': vol_idx['id'], 'parent_pool_id': vol_idx['pool_id']
|
||||
}
|
||||
resp = self._etcd_txn({ 'compare': [
|
||||
{ 'target': 'MOD', 'mod_revision': vol_idx_mod, 'key': 'index/image/'+vol_name },
|
||||
{ 'target': 'MOD', 'mod_revision': vol_mod, 'key': 'config/inode/'+str(vol_idx['pool_id'])+'/'+str(vol_idx['id']) },
|
||||
{ 'target': 'MOD', 'mod_revision': id_mod, 'key': 'index/maxid/'+str(self.cfg['pool_id']) },
|
||||
{ 'target': 'VERSION', 'version': 0, 'key': 'index/image/'+snap_vol_name },
|
||||
{ 'target': 'VERSION', 'version': 0, 'key': 'config/inode/'+str(self.cfg['pool_id'])+'/'+str(new_id) },
|
||||
], 'success': [
|
||||
{ 'request_put': { 'key': 'index/maxid/'+str(self.cfg['pool_id']), 'value': new_id } },
|
||||
{ 'request_put': { 'key': 'index/image/'+vol_name, 'value': json.dumps({
|
||||
'id': new_id, 'pool_id': self.cfg['pool_id']
|
||||
}) } },
|
||||
{ 'request_put': { 'key': 'config/inode/'+str(self.cfg['pool_id'])+'/'+str(new_id), 'value': json.dumps(new_cfg) } },
|
||||
{ 'request_put': { 'key': 'index/image/'+snap_vol_name, 'value': json.dumps({
|
||||
'id': vol_idx['id'], 'pool_id': vol_idx['pool_id']
|
||||
}) } },
|
||||
{ 'request_put': { 'key': 'config/inode/'+str(vol_idx['pool_id'])+'/'+str(vol_idx['id']), 'value': json.dumps({
|
||||
**vol_cfg, 'name': snap_vol_name, 'readonly': True
|
||||
}) } }
|
||||
] })
|
||||
if resp.get('succeeded'):
|
||||
return new_cfg
|
||||
|
||||
def initialize_connection(self, volume, connector):
|
||||
data = {
|
||||
'driver_volume_type': 'vitastor',
|
||||
'data': {
|
||||
'config_path': self.configuration.vitastor_config_path,
|
||||
'etcd_address': self.configuration.vitastor_etcd_address,
|
||||
'etcd_prefix': self.configuration.vitastor_etcd_prefix,
|
||||
'name': volume.name,
|
||||
'logical_block_size': '512',
|
||||
'physical_block_size': '4096',
|
||||
}
|
||||
}
|
||||
LOG.debug('connection data: %s', data)
|
||||
return data
|
||||
|
||||
def terminate_connection(self, volume, connector, **kwargs):
|
||||
pass
|
||||
|
||||
def clone_image(self, context, volume, image_location, image_meta, image_service):
|
||||
if image_location:
|
||||
# Note: image_location[0] is glance image direct_url.
|
||||
# image_location[1] contains the list of all locations (including
|
||||
# direct_url) or None if show_multiple_locations is False in
|
||||
# glance configuration.
|
||||
if image_location[1]:
|
||||
url_locations = [location['url'] for location in image_location[1]]
|
||||
else:
|
||||
url_locations = [image_location[0]]
|
||||
# iterate all locations to look for a cloneable one.
|
||||
for url_location in url_locations:
|
||||
if url_location and url_location.startswith('cinder://'):
|
||||
# The idea is to use cinder://<volume-id> Glance volumes as base images
|
||||
base_vol = self.db.volume_get(context, url_location[len('cinder://') : ])
|
||||
if not base_vol or base_vol.volume_type_id != volume.volume_type_id:
|
||||
continue
|
||||
size = int(volume.size) * units.Gi
|
||||
dest_name = utils.convert_str(volume.name)
|
||||
# Find or create the base snapshot
|
||||
snap_cfg = self._create_snapshot(base_vol.name, base_vol.name+'@.clone_snap', True)
|
||||
# Then create a clone from it
|
||||
new_cfg = self._create_image(dest_name, {
|
||||
'size': size,
|
||||
'parent_id': snap_cfg['parent_id'],
|
||||
'parent_pool_id': snap_cfg['parent_pool_id'],
|
||||
})
|
||||
return ({}, True)
|
||||
return ({}, False)
|
||||
|
||||
def copy_image_to_encrypted_volume(self, context, volume, image_service, image_id):
|
||||
self.copy_image_to_volume(context, volume, image_service, image_id, encrypted = True)
|
||||
|
||||
def copy_image_to_volume(self, context, volume, image_service, image_id, encrypted = False, disable_sparse = False):
|
||||
tmp_dir = volume_utils.image_conversion_dir()
|
||||
with tempfile.NamedTemporaryFile(dir = tmp_dir) as tmp:
|
||||
image_utils.fetch_to_raw(
|
||||
context, image_service, image_id, tmp.name,
|
||||
self.configuration.volume_dd_blocksize, size = volume.size
|
||||
)
|
||||
out_format = [ '-O', 'raw' ]
|
||||
if encrypted:
|
||||
key_file, opts = self._encrypt_opts(volume, context)
|
||||
out_format = [ '-O', 'luks', *opts ]
|
||||
dest_name = utils.convert_str(volume.name)
|
||||
self._try_execute(
|
||||
'qemu-img', 'convert', '-f', 'raw', tmp.name, *out_format,
|
||||
'vitastor:image='+dest_name.replace(':', '\\:')+self._qemu_args()
|
||||
)
|
||||
if encrypted:
|
||||
key_file.close()
|
||||
|
||||
def copy_volume_to_image(self, context, volume, image_service, image_meta):
|
||||
tmp_dir = volume_utils.image_conversion_dir()
|
||||
tmp_file = os.path.join(tmp_dir, volume.name + '-' + image_meta['id'])
|
||||
with fileutils.remove_path_on_error(tmp_file):
|
||||
vol_name = utils.convert_str(volume.name)
|
||||
self._try_execute(
|
||||
'qemu-img', 'convert', '-f', 'raw',
|
||||
'vitastor:image='+vol_name.replace(':', '\\:')+self._qemu_args(),
|
||||
'-O', 'raw', tmp_file
|
||||
)
|
||||
# FIXME: Copy directly if the destination image is also in Vitastor
|
||||
volume_utils.upload_volume(context, image_service, image_meta, tmp_file, volume)
|
||||
os.unlink(tmp_file)
|
||||
|
||||
def _get_image(self, vol_name):
|
||||
# find the image
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'index/image/'+vol_name } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) == 0:
|
||||
return None
|
||||
vol_idx = resp['responses'][0]['kvs'][0]['value']
|
||||
vol_idx_mod = resp['responses'][0]['kvs'][0]['mod_revision']
|
||||
# get image inode config
|
||||
resp = self._etcd_txn({ 'success': [
|
||||
{ 'request_range': { 'key': 'config/inode/'+str(vol_idx['pool_id'])+'/'+str(vol_idx['id']) } },
|
||||
] })
|
||||
if len(resp['responses'][0]['kvs']) == 0:
|
||||
return None
|
||||
vol_cfg = resp['responses'][0]['kvs'][0]['value']
|
||||
vol_cfg_mod = resp['responses'][0]['kvs'][0]['mod_revision']
|
||||
return {
|
||||
'cfg': vol_cfg,
|
||||
'cfg_mod': vol_cfg_mod,
|
||||
'idx': vol_idx,
|
||||
'idx_mod': vol_idx_mod,
|
||||
}
|
||||
|
||||
def extend_volume(self, volume, new_size):
|
||||
"""Extend an existing volume."""
|
||||
vol_name = utils.convert_str(volume.name)
|
||||
while True:
|
||||
vol = self._get_image(vol_name)
|
||||
if not vol:
|
||||
raise exception.VolumeBackendAPIException(data = 'Volume '+vol_name+' does not exist')
|
||||
# change size
|
||||
size = int(new_size) * units.Gi
|
||||
if size == vol['cfg']['size']:
|
||||
break
|
||||
resp = self._etcd_txn({ 'compare': [ {
|
||||
'target': 'MOD',
|
||||
'mod_revision': vol['cfg_mod'],
|
||||
'key': 'config/inode/'+str(vol['idx']['pool_id'])+'/'+str(vol['idx']['id']),
|
||||
} ], 'success': [
|
||||
{ 'request_put': {
|
||||
'key': 'config/inode/'+str(vol['idx']['pool_id'])+'/'+str(vol['idx']['id']),
|
||||
'value': json.dumps({ **vol['cfg'], 'size': size }),
|
||||
} },
|
||||
] })
|
||||
if resp.get('succeeded'):
|
||||
break
|
||||
LOG.debug(
|
||||
"Extend volume from %(old_size)s GB to %(new_size)s GB.",
|
||||
{'old_size': volume.size, 'new_size': new_size}
|
||||
)
|
||||
|
||||
def _add_manageable_volume(self, kv, manageable_volumes, cinder_ids):
|
||||
cfg = kv['value']
|
||||
if kv['key'].find('@') >= 0:
|
||||
# snapshot
|
||||
return
|
||||
image_id = volume_utils.extract_id_from_volume_name(cfg['name'])
|
||||
image_info = {
|
||||
'reference': {'source-name': image_name},
|
||||
'size': int(math.ceil(float(cfg['size']) / units.Gi)),
|
||||
'cinder_id': None,
|
||||
'extra_info': None,
|
||||
}
|
||||
if image_id in cinder_ids:
|
||||
image_info['cinder_id'] = image_id
|
||||
image_info['safe_to_manage'] = False
|
||||
image_info['reason_not_safe'] = 'already managed'
|
||||
else:
|
||||
image_info['safe_to_manage'] = True
|
||||
image_info['reason_not_safe'] = None
|
||||
manageable_volumes.append(image_info)
|
||||
|
||||
def get_manageable_volumes(self, cinder_volumes, marker, limit, offset, sort_keys, sort_dirs):
|
||||
manageable_volumes = []
|
||||
cinder_ids = [resource['id'] for resource in cinder_volumes]
|
||||
|
||||
# List all volumes
|
||||
# FIXME: It's possible to use pagination in our case, but.. do we want it?
|
||||
self._etcd_foreach('config/inode/'+str(self.cfg['pool_id']),
|
||||
lambda kv: self._add_manageable_volume(kv, manageable_volumes, cinder_ids))
|
||||
|
||||
return volume_utils.paginate_entries_list(
|
||||
manageable_volumes, marker, limit, offset, sort_keys, sort_dirs)
|
||||
|
||||
def _get_existing_name(existing_ref):
|
||||
if not isinstance(existing_ref, dict):
|
||||
existing_ref = {"source-name": existing_ref}
|
||||
if 'source-name' not in existing_ref:
|
||||
reason = _('Reference must contain source-name element.')
|
||||
raise exception.ManageExistingInvalidReference(existing_ref=existing_ref, reason=reason)
|
||||
src_name = utils.convert_str(existing_ref['source-name'])
|
||||
if not src_name:
|
||||
reason = _('Reference must contain source-name element.')
|
||||
raise exception.ManageExistingInvalidReference(existing_ref=existing_ref, reason=reason)
|
||||
return src_name
|
||||
|
||||
def manage_existing_get_size(self, volume, existing_ref):
|
||||
"""Return size of an existing image for manage_existing.
|
||||
|
||||
:param volume: volume ref info to be set
|
||||
:param existing_ref: {'source-name': <image name>}
|
||||
"""
|
||||
src_name = self._get_existing_name(existing_ref)
|
||||
vol = self._get_image(src_name)
|
||||
if not vol:
|
||||
raise exception.VolumeBackendAPIException(data = 'Volume '+src_name+' does not exist')
|
||||
return int(math.ceil(float(vol['cfg']['size']) / units.Gi))
|
||||
|
||||
def manage_existing(self, volume, existing_ref):
|
||||
"""Manages an existing image.
|
||||
|
||||
Renames the image name to match the expected name for the volume.
|
||||
|
||||
:param volume: volume ref info to be set
|
||||
:param existing_ref: {'source-name': <image name>}
|
||||
"""
|
||||
from_name = self._get_existing_name(existing_ref)
|
||||
to_name = utils.convert_str(volume.name)
|
||||
self._rename(from_name, to_name)
|
||||
|
||||
def _rename(self, from_name, to_name):
|
||||
while True:
|
||||
vol = self._get_image(from_name)
|
||||
if not vol:
|
||||
raise exception.VolumeBackendAPIException(data = 'Volume '+from_name+' does not exist')
|
||||
to = self._get_image(to_name)
|
||||
if to:
|
||||
raise exception.VolumeBackendAPIException(data = 'Volume '+to_name+' already exists')
|
||||
resp = self._etcd_txn({ 'compare': [
|
||||
{ 'target': 'MOD', 'mod_revision': vol['idx_mod'], 'key': 'index/image/'+vol['cfg']['name'] },
|
||||
{ 'target': 'MOD', 'mod_revision': vol['cfg_mod'], 'key': 'config/inode/'+str(vol['idx']['pool_id'])+'/'+str(vol['idx']['id']) },
|
||||
{ 'target': 'VERSION', 'version': 0, 'key': 'index/image/'+to_name },
|
||||
], 'success': [
|
||||
{ 'request_delete_range': { 'key': 'index/image/'+vol['cfg']['name'] } },
|
||||
{ 'request_put': { 'key': 'index/image/'+to_name, 'value': json.dumps(vol['idx']) } },
|
||||
{ 'request_put': { 'key': 'config/inode/'+str(vol['idx']['pool_id'])+'/'+str(vol['idx']['id']),
|
||||
'value': json.dumps({ **vol['cfg'], 'name': to_name }) } },
|
||||
] })
|
||||
if resp.get('succeeded'):
|
||||
break
|
||||
|
||||
def unmanage(self, volume):
|
||||
pass
|
||||
|
||||
def _add_manageable_snapshot(self, kv, manageable_snapshots, cinder_ids):
|
||||
cfg = kv['value']
|
||||
dog = kv['key'].find('@')
|
||||
if dog < 0:
|
||||
# snapshot
|
||||
return
|
||||
image_name = kv['key'][0 : dog]
|
||||
snap_name = kv['key'][dog+1 : ]
|
||||
snapshot_id = volume_utils.extract_id_from_snapshot_name(snap_name)
|
||||
snapshot_info = {
|
||||
'reference': {'source-name': snap_name},
|
||||
'size': int(math.ceil(float(cfg['size']) / units.Gi)),
|
||||
'cinder_id': None,
|
||||
'extra_info': None,
|
||||
'safe_to_manage': False,
|
||||
'reason_not_safe': None,
|
||||
'source_reference': {'source-name': image_name}
|
||||
}
|
||||
if snapshot_id in cinder_ids:
|
||||
# Exclude snapshots already managed.
|
||||
snapshot_info['reason_not_safe'] = ('already managed')
|
||||
snapshot_info['cinder_id'] = snapshot_id
|
||||
elif snap_name.endswith('.clone_snap'):
|
||||
# Exclude clone snapshot.
|
||||
snapshot_info['reason_not_safe'] = ('used for clone snap')
|
||||
else:
|
||||
snapshot_info['safe_to_manage'] = True
|
||||
manageable_snapshots.append(snapshot_info)
|
||||
|
||||
def get_manageable_snapshots(self, cinder_snapshots, marker, limit, offset, sort_keys, sort_dirs):
|
||||
"""List manageable snapshots in Vitastor."""
|
||||
manageable_snapshots = []
|
||||
cinder_snapshot_ids = [resource['id'] for resource in cinder_snapshots]
|
||||
# List all volumes
|
||||
# FIXME: It's possible to use pagination in our case, but.. do we want it?
|
||||
self._etcd_foreach('config/inode/'+str(self.cfg['pool_id']),
|
||||
lambda kv: self._add_manageable_volume(kv, manageable_snapshots, cinder_snapshot_ids))
|
||||
return volume_utils.paginate_entries_list(
|
||||
manageable_snapshots, marker, limit, offset, sort_keys, sort_dirs)
|
||||
|
||||
def manage_existing_snapshot_get_size(self, snapshot, existing_ref):
|
||||
"""Return size of an existing image for manage_existing.
|
||||
|
||||
:param snapshot: snapshot ref info to be set
|
||||
:param existing_ref: {'source-name': <name of snapshot>}
|
||||
"""
|
||||
vol_name = utils.convert_str(snapshot.volume_name)
|
||||
snap_name = self._get_existing_name(existing_ref)
|
||||
vol = self._get_image(vol_name+'@'+snap_name)
|
||||
if not vol:
|
||||
raise exception.ManageExistingInvalidReference(
|
||||
existing_ref=snapshot_name, reason='Specified snapshot does not exist.'
|
||||
)
|
||||
return int(math.ceil(float(vol['cfg']['size']) / units.Gi))
|
||||
|
||||
def manage_existing_snapshot(self, snapshot, existing_ref):
|
||||
"""Manages an existing snapshot.
|
||||
|
||||
Renames the snapshot name to match the expected name for the snapshot.
|
||||
Error checking done by manage_existing_get_size is not repeated.
|
||||
|
||||
:param snapshot: snapshot ref info to be set
|
||||
:param existing_ref: {'source-name': <name of snapshot>}
|
||||
"""
|
||||
vol_name = utils.convert_str(snapshot.volume_name)
|
||||
snap_name = self._get_existing_name(existing_ref)
|
||||
from_name = vol_name+'@'+snap_name
|
||||
to_name = vol_name+'@'+utils.convert_str(snapshot.name)
|
||||
self._rename(from_name, to_name)
|
||||
|
||||
def unmanage_snapshot(self, snapshot):
|
||||
"""Removes the specified snapshot from Cinder management."""
|
||||
pass
|
||||
|
||||
def _dumps(self, obj):
|
||||
return json.dumps(obj, separators=(',', ':'), sort_keys=True)
|
|
@ -1,643 +0,0 @@
|
|||
commit 1f7e90e36b2afca0312392979b96d31951a8d66b
|
||||
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
|
||||
Date: Thu Jun 27 01:34:54 2024 +0300
|
||||
|
||||
Add Vitastor support
|
||||
|
||||
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
|
||||
index aaad4a3da1..5f5daa8341 100644
|
||||
--- a/include/libvirt/libvirt-storage.h
|
||||
+++ b/include/libvirt/libvirt-storage.h
|
||||
@@ -326,6 +326,7 @@ typedef enum {
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
|
||||
} virConnectListAllStoragePoolsFlags;
|
||||
|
||||
int virConnectListAllStoragePools(virConnectPtr conn,
|
||||
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
|
||||
index fde594f811..66537db3e3 100644
|
||||
--- a/src/conf/domain_conf.c
|
||||
+++ b/src/conf/domain_conf.c
|
||||
@@ -7220,7 +7220,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
|
||||
src->configFile = virXPathString("string(./config/@file)", ctxt);
|
||||
|
||||
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
|
||||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
|
||||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
|
||||
src->query = virXMLPropString(node, "query");
|
||||
|
||||
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
|
||||
@@ -30734,6 +30735,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_POOL_MPATH:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
|
||||
index 395e036e8f..8a0190f85b 100644
|
||||
--- a/src/conf/domain_validate.c
|
||||
+++ b/src/conf/domain_validate.c
|
||||
@@ -495,6 +495,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
}
|
||||
}
|
||||
|
||||
- /* internal snapshots and config files are currently supported only with rbd: */
|
||||
+ /* internal snapshots are currently supported only with rbd: */
|
||||
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
|
||||
if (src->snapshot) {
|
||||
@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
|
||||
_("<snapshot> element is currently supported only with 'rbd' disks"));
|
||||
return -1;
|
||||
}
|
||||
+ }
|
||||
|
||||
+ /* config files are currently supported only with rbd and vitastor: */
|
||||
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
|
||||
if (src->configFile) {
|
||||
virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
- _("<config> element is currently supported only with 'rbd' disks"));
|
||||
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
|
||||
index a46a824f88..4c5b720643 100644
|
||||
--- a/src/conf/schemas/domaincommon.rng
|
||||
+++ b/src/conf/schemas/domaincommon.rng
|
||||
@@ -1997,6 +1997,35 @@
|
||||
</element>
|
||||
</define>
|
||||
|
||||
+ <define name="diskSourceNetworkProtocolVitastor">
|
||||
+ <element name="source">
|
||||
+ <interleave>
|
||||
+ <attribute name="protocol">
|
||||
+ <value>vitastor</value>
|
||||
+ </attribute>
|
||||
+ <ref name="diskSourceCommon"/>
|
||||
+ <optional>
|
||||
+ <attribute name="name"/>
|
||||
+ </optional>
|
||||
+ <optional>
|
||||
+ <attribute name="query"/>
|
||||
+ </optional>
|
||||
+ <zeroOrMore>
|
||||
+ <ref name="diskSourceNetworkHost"/>
|
||||
+ </zeroOrMore>
|
||||
+ <optional>
|
||||
+ <element name="config">
|
||||
+ <attribute name="file">
|
||||
+ <ref name="absFilePath"/>
|
||||
+ </attribute>
|
||||
+ <empty/>
|
||||
+ </element>
|
||||
+ </optional>
|
||||
+ <empty/>
|
||||
+ </interleave>
|
||||
+ </element>
|
||||
+ </define>
|
||||
+
|
||||
<define name="diskSourceNetworkProtocolISCSI">
|
||||
<element name="source">
|
||||
<attribute name="protocol">
|
||||
@@ -2347,6 +2376,7 @@
|
||||
<ref name="diskSourceNetworkProtocolSimple"/>
|
||||
<ref name="diskSourceNetworkProtocolVxHS"/>
|
||||
<ref name="diskSourceNetworkProtocolNFS"/>
|
||||
+ <ref name="diskSourceNetworkProtocolVitastor"/>
|
||||
</choice>
|
||||
</define>
|
||||
|
||||
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
|
||||
index 68842004b7..1d69a788b6 100644
|
||||
--- a/src/conf/storage_conf.c
|
||||
+++ b/src/conf/storage_conf.c
|
||||
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
|
||||
"logical", "disk", "iscsi",
|
||||
"iscsi-direct", "scsi", "mpath",
|
||||
"rbd", "sheepdog", "gluster",
|
||||
- "zfs", "vstorage",
|
||||
+ "zfs", "vstorage", "vitastor",
|
||||
);
|
||||
|
||||
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
|
||||
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
|
||||
.formatToString = virStorageFileFormatTypeToString,
|
||||
}
|
||||
},
|
||||
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
|
||||
+ .poolOptions = {
|
||||
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
|
||||
+ VIR_STORAGE_POOL_SOURCE_NAME),
|
||||
+ },
|
||||
+ .volOptions = {
|
||||
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
|
||||
+ .formatFromString = virStorageVolumeFormatFromString,
|
||||
+ .formatToString = virStorageFileFormatTypeToString,
|
||||
+ }
|
||||
+ },
|
||||
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
|
||||
.poolOptions = {
|
||||
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
|
||||
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
|
||||
_("element 'name' is mandatory for RBD pool"));
|
||||
return -1;
|
||||
}
|
||||
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
|
||||
+ virReportError(VIR_ERR_XML_ERROR, "%s",
|
||||
+ _("element 'name' is mandatory for Vitastor pool"));
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
if (options->formatFromString) {
|
||||
g_autofree char *format = NULL;
|
||||
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
|
||||
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
|
||||
* files, so they don't have a target */
|
||||
if (def->type != VIR_STORAGE_POOL_RBD &&
|
||||
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
|
||||
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
|
||||
def->type != VIR_STORAGE_POOL_GLUSTER &&
|
||||
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
|
||||
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
|
||||
index fc67957cfe..720c07ef74 100644
|
||||
--- a/src/conf/storage_conf.h
|
||||
+++ b/src/conf/storage_conf.h
|
||||
@@ -103,6 +103,7 @@ typedef enum {
|
||||
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
|
||||
VIR_STORAGE_POOL_ZFS, /* ZFS */
|
||||
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
|
||||
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
|
||||
|
||||
VIR_STORAGE_POOL_LAST,
|
||||
} virStoragePoolType;
|
||||
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
|
||||
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
|
||||
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
|
||||
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
|
||||
index 959ec5ed40..e751dd4d6a 100644
|
||||
--- a/src/conf/storage_source_conf.c
|
||||
+++ b/src/conf/storage_source_conf.c
|
||||
@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
|
||||
"ssh",
|
||||
"vxhs",
|
||||
"nfs",
|
||||
+ "vitastor",
|
||||
);
|
||||
|
||||
|
||||
@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
return 24007;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
/* we don't provide a default for RBD */
|
||||
return 0;
|
||||
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
|
||||
index 05b4bda16c..b5ed143c39 100644
|
||||
--- a/src/conf/storage_source_conf.h
|
||||
+++ b/src/conf/storage_source_conf.h
|
||||
@@ -129,6 +129,7 @@ typedef enum {
|
||||
VIR_STORAGE_NET_PROTOCOL_SSH,
|
||||
VIR_STORAGE_NET_PROTOCOL_VXHS,
|
||||
VIR_STORAGE_NET_PROTOCOL_NFS,
|
||||
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
|
||||
|
||||
VIR_STORAGE_NET_PROTOCOL_LAST
|
||||
} virStorageNetProtocol;
|
||||
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
|
||||
index 59fa5da372..4739167f5f 100644
|
||||
--- a/src/conf/virstorageobj.c
|
||||
+++ b/src/conf/virstorageobj.c
|
||||
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
|
||||
return 1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
|
||||
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
|
||||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
|
||||
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
|
||||
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
|
||||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
|
||||
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
|
||||
index db7660aac4..561df34709 100644
|
||||
--- a/src/libvirt-storage.c
|
||||
+++ b/src/libvirt-storage.c
|
||||
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
|
||||
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
|
||||
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
|
||||
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
|
||||
index 62e1be6672..71a1d42896 100644
|
||||
--- a/src/libxl/libxl_conf.c
|
||||
+++ b/src/libxl/libxl_conf.c
|
||||
@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
|
||||
index 53f6871efc..c34b8cee1a 100644
|
||||
--- a/src/libxl/xen_xl.c
|
||||
+++ b/src/libxl/xen_xl.c
|
||||
@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
virReportError(VIR_ERR_NO_SUPPORT,
|
||||
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
|
||||
index 738b72d7ea..5dd082fc89 100644
|
||||
--- a/src/qemu/qemu_block.c
|
||||
+++ b/src/qemu/qemu_block.c
|
||||
@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
|
||||
}
|
||||
|
||||
|
||||
+static virJSONValue *
|
||||
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
|
||||
+{
|
||||
+ virJSONValue *ret = NULL;
|
||||
+ virStorageNetHostDef *host;
|
||||
+ size_t i;
|
||||
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
|
||||
+ g_autofree char *etcd = NULL;
|
||||
+
|
||||
+ for (i = 0; i < src->nhosts; i++) {
|
||||
+ host = src->hosts + i;
|
||||
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
|
||||
+ return NULL;
|
||||
+ }
|
||||
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
|
||||
+ }
|
||||
+ if (src->nhosts > 0) {
|
||||
+ etcd = virBufferContentAndReset(&buf);
|
||||
+ }
|
||||
+
|
||||
+ if (virJSONValueObjectAdd(&ret,
|
||||
+ "S:etcd-host", etcd,
|
||||
+ "S:etcd-prefix", src->query,
|
||||
+ "S:config-path", src->configFile,
|
||||
+ "s:image", src->path,
|
||||
+ NULL) < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static virJSONValue *
|
||||
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
|
||||
{
|
||||
@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
|
||||
return NULL;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return NULL;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
@@ -2020,6 +2058,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_VXHS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NFS:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SSH:
|
||||
@@ -2400,6 +2439,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ driver = "vitastor";
|
||||
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
driver = "sheepdog";
|
||||
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
|
||||
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
|
||||
index bda62f2e5c..84b4e5f2b8 100644
|
||||
--- a/src/qemu/qemu_domain.c
|
||||
+++ b/src/qemu/qemu_domain.c
|
||||
@@ -5260,7 +5260,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
|
||||
if (src->query &&
|
||||
(actualType != VIR_STORAGE_TYPE_NETWORK ||
|
||||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
|
||||
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
|
||||
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
|
||||
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
|
||||
_("query is supported only with HTTP(S) protocols"));
|
||||
return -1;
|
||||
@@ -10514,6 +10515,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
|
||||
break;
|
||||
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
|
||||
index f5260c4a22..2f9d8406fe 100644
|
||||
--- a/src/qemu/qemu_snapshot.c
|
||||
+++ b/src/qemu/qemu_snapshot.c
|
||||
@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NBD:
|
||||
case VIR_STORAGE_NET_PROTOCOL_RBD:
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
|
||||
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
|
||||
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
|
||||
index 86c03762d2..630c6eff1a 100644
|
||||
--- a/src/storage/storage_driver.c
|
||||
+++ b/src/storage/storage_driver.c
|
||||
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
|
||||
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_SHEEPDOG:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
|
||||
index 80681924ea..8a3ade9ec0 100644
|
||||
--- a/src/storage_file/storage_source_backingstore.c
|
||||
+++ b/src/storage_file/storage_source_backingstore.c
|
||||
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
|
||||
}
|
||||
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseVitastorColonString(const char *colonstr,
|
||||
+ virStorageSource *src)
|
||||
+{
|
||||
+ char *p, *e, *next;
|
||||
+ g_autofree char *options = NULL;
|
||||
+
|
||||
+ /* optionally skip the "vitastor:" prefix if provided */
|
||||
+ if (STRPREFIX(colonstr, "vitastor:"))
|
||||
+ colonstr += strlen("vitastor:");
|
||||
+
|
||||
+ options = g_strdup(colonstr);
|
||||
+
|
||||
+ p = options;
|
||||
+ while (*p) {
|
||||
+ /* find : delimiter or end of string */
|
||||
+ for (e = p; *e && *e != ':'; ++e) {
|
||||
+ if (*e == '\\') {
|
||||
+ e++;
|
||||
+ if (*e == '\0')
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if (*e == '\0') {
|
||||
+ next = e; /* last kv pair */
|
||||
+ } else {
|
||||
+ next = e + 1;
|
||||
+ *e = '\0';
|
||||
+ }
|
||||
+
|
||||
+ if (STRPREFIX(p, "image=")) {
|
||||
+ src->path = g_strdup(p + strlen("image="));
|
||||
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
|
||||
+ src->query = g_strdup(p + strlen("etcd-prefix="));
|
||||
+ } else if (STRPREFIX(p, "config-path=")) {
|
||||
+ src->configFile = g_strdup(p + strlen("config-path="));
|
||||
+ } else if (STRPREFIX(p, "etcd-host=")) {
|
||||
+ char *h, *sep;
|
||||
+
|
||||
+ h = p + strlen("etcd-host=");
|
||||
+ while (h < e) {
|
||||
+ for (sep = h; sep < e; ++sep) {
|
||||
+ if (*sep == '\\' && (sep[1] == ',' ||
|
||||
+ sep[1] == ';' ||
|
||||
+ sep[1] == ' ')) {
|
||||
+ *sep = '\0';
|
||||
+ sep += 2;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (virStorageSourceRBDAddHost(src, h) < 0)
|
||||
+ return -1;
|
||||
+
|
||||
+ h = sep;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ p = next;
|
||||
+ }
|
||||
+
|
||||
+ if (!src->path) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseNBDColonString(const char *nbdstr,
|
||||
virStorageSource *src)
|
||||
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
|
||||
return -1;
|
||||
break;
|
||||
|
||||
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
|
||||
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
|
||||
+ return -1;
|
||||
+ break;
|
||||
+
|
||||
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
|
||||
case VIR_STORAGE_NET_PROTOCOL_LAST:
|
||||
case VIR_STORAGE_NET_PROTOCOL_NONE:
|
||||
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int
|
||||
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
|
||||
+ virJSONValue *json,
|
||||
+ const char *jsonstr G_GNUC_UNUSED,
|
||||
+ int opaque G_GNUC_UNUSED)
|
||||
+{
|
||||
+ const char *filename;
|
||||
+ const char *image = virJSONValueObjectGetString(json, "image");
|
||||
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
|
||||
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
|
||||
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
|
||||
+ size_t nservers;
|
||||
+ size_t i;
|
||||
+
|
||||
+ src->type = VIR_STORAGE_TYPE_NETWORK;
|
||||
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
|
||||
+
|
||||
+ /* legacy syntax passed via 'filename' option */
|
||||
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
|
||||
+ return virStorageSourceParseVitastorColonString(filename, src);
|
||||
+
|
||||
+ if (!image) {
|
||||
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
|
||||
+ _("missing image name in Vitastor backing volume "
|
||||
+ "JSON specification"));
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ src->path = g_strdup(image);
|
||||
+ src->configFile = g_strdup(conf);
|
||||
+ src->query = g_strdup(etcd_prefix);
|
||||
+
|
||||
+ if (servers) {
|
||||
+ nservers = virJSONValueArraySize(servers);
|
||||
+
|
||||
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
|
||||
+ src->nhosts = nservers;
|
||||
+
|
||||
+ for (i = 0; i < nservers; i++) {
|
||||
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
|
||||
+ virJSONValueArrayGet(servers, i)) < 0)
|
||||
+ return -1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
|
||||
virJSONValue *json,
|
||||
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
|
||||
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
|
||||
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
|
||||
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
|
||||
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
|
||||
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
|
||||
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
|
||||
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
|
||||
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
|
||||
index d2d1bc43e3..31a92e4a01 100644
|
||||
--- a/src/test/test_driver.c
|
||||
+++ b/src/test/test_driver.c
|
||||
@@ -7339,6 +7339,7 @@ testStorageVolumeTypeForPool(int pooltype)
|
||||
case VIR_STORAGE_POOL_ISCSI_DIRECT:
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_RBD:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
return VIR_STORAGE_VOL_NETWORK;
|
||||
case VIR_STORAGE_POOL_LOGICAL:
|
||||
case VIR_STORAGE_POOL_DISK:
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
index eee75af746..8bd0a57bdd 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='no'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
index 805950a937..852df0de16 100644
|
||||
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
|
||||
@@ -204,4 +204,11 @@
|
||||
</enum>
|
||||
</volOptions>
|
||||
</pool>
|
||||
+ <pool type='vitastor' supported='yes'>
|
||||
+ <volOptions>
|
||||
+ <defaultFormat type='raw'/>
|
||||
+ <enum name='targetFormatType'>
|
||||
+ </enum>
|
||||
+ </volOptions>
|
||||
+ </pool>
|
||||
</storagepoolCapabilities>
|
||||
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
|
||||
index e8e40d695e..db55fe5f3a 100644
|
||||
--- a/tests/storagepoolxml2argvtest.c
|
||||
+++ b/tests/storagepoolxml2argvtest.c
|
||||
@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
|
||||
case VIR_STORAGE_POOL_GLUSTER:
|
||||
case VIR_STORAGE_POOL_ZFS:
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
default:
|
||||
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
|
||||
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
|
||||
index f9aad8ded0..64704b4288 100644
|
||||
--- a/tools/virsh-pool.c
|
||||
+++ b/tools/virsh-pool.c
|
||||
@@ -1187,6 +1187,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
|
||||
case VIR_STORAGE_POOL_VSTORAGE:
|
||||
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
|
||||
break;
|
||||
+ case VIR_STORAGE_POOL_VITASTOR:
|
||||
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
|
||||
+ break;
|
||||
case VIR_STORAGE_POOL_LAST:
|
||||
break;
|
||||
}
|
|
@ -366,7 +366,6 @@ resume_0:
|
|||
!flusher->flush_queue.size() || !flusher->dequeuing)
|
||||
{
|
||||
stop_flusher:
|
||||
flusher->dequeuing = false;
|
||||
if (flusher->trim_wanted > 0 && try_trim)
|
||||
{
|
||||
// Attempt forced trim
|
||||
|
@ -374,6 +373,7 @@ stop_flusher:
|
|||
flusher->active_flushers++;
|
||||
goto trim_journal;
|
||||
}
|
||||
flusher->dequeuing = false;
|
||||
wait_state = 0;
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -271,7 +271,7 @@ void http_co_t::close_connection()
|
|||
}
|
||||
if (peer_fd >= 0)
|
||||
{
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
tfd->set_fd_handler(peer_fd, 0, NULL);
|
||||
close(peer_fd);
|
||||
peer_fd = -1;
|
||||
}
|
||||
|
@ -314,7 +314,7 @@ void http_co_t::start_connection()
|
|||
stackout();
|
||||
return;
|
||||
}
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLOUT, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
this->epoll_events |= epoll_events;
|
||||
handle_events();
|
||||
|
@ -372,7 +372,7 @@ void http_co_t::handle_connect_result()
|
|||
}
|
||||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
this->epoll_events |= epoll_events;
|
||||
handle_events();
|
||||
|
|
|
@ -15,6 +15,140 @@
|
|||
#include "msgr_rdma.h"
|
||||
#endif
|
||||
|
||||
#include <sys/poll.h>
|
||||
#include <sys/eventfd.h>
|
||||
|
||||
msgr_iothread_t::msgr_iothread_t()
|
||||
{
|
||||
ring = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
|
||||
epmgr = new epoll_manager_t(ring);
|
||||
submit_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
|
||||
if (submit_eventfd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("failed to create eventfd: ")+strerror(errno));
|
||||
}
|
||||
epmgr->tfd->set_fd_handler(submit_eventfd, false, [this](int fd, int epoll_events)
|
||||
{
|
||||
// Reset eventfd counter
|
||||
uint64_t ctr = 0;
|
||||
int r = read(submit_eventfd, &ctr, 8);
|
||||
if (r < 0 && errno != EAGAIN && errno != EINTR)
|
||||
{
|
||||
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
|
||||
}
|
||||
// Submit pending events
|
||||
submit_pending_writes();
|
||||
});
|
||||
thread = new std::thread(&msgr_iothread_t::run, this);
|
||||
}
|
||||
|
||||
msgr_iothread_t::~msgr_iothread_t()
|
||||
{
|
||||
stop();
|
||||
delete thread;
|
||||
delete epmgr;
|
||||
delete ring;
|
||||
}
|
||||
|
||||
void msgr_iothread_t::stop()
|
||||
{
|
||||
mu.lock();
|
||||
if (stopped)
|
||||
{
|
||||
mu.unlock();
|
||||
return;
|
||||
}
|
||||
stopped = true;
|
||||
if (callback_stopped)
|
||||
{
|
||||
*callback_stopped = true;
|
||||
callback_stopped = NULL;
|
||||
}
|
||||
close(submit_eventfd);
|
||||
mu.unlock();
|
||||
thread->join();
|
||||
}
|
||||
|
||||
static uint64_t one = 1;
|
||||
|
||||
void msgr_iothread_t::wakeup(osd_client_t *cl, ring_loop_t *outer_ring)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mu);
|
||||
if (!pending_clients.size())
|
||||
{
|
||||
io_uring_sqe* sqe = outer_ring->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
write(submit_eventfd, &one, sizeof(one));
|
||||
}
|
||||
else
|
||||
{
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [](ring_data_t*){};
|
||||
my_uring_prep_write(sqe, submit_eventfd, &one, sizeof(one), 0);
|
||||
}
|
||||
}
|
||||
add_pending(cl);
|
||||
}
|
||||
|
||||
void msgr_iothread_t::add_pending(osd_client_t *cl)
|
||||
{
|
||||
auto it = std::lower_bound(pending_clients.begin(), pending_clients.end(), cl);
|
||||
if (it == pending_clients.end() || *it != cl)
|
||||
{
|
||||
pending_clients.insert(it, cl);
|
||||
}
|
||||
}
|
||||
|
||||
void msgr_iothread_t::submit_pending_writes()
|
||||
{
|
||||
mu.lock();
|
||||
std::vector<osd_client_t*> clients = std::move(pending_clients);
|
||||
mu.unlock();
|
||||
for (int i = 0; i < clients.size(); i++)
|
||||
{
|
||||
auto cl = clients[i];
|
||||
std::unique_lock<std::mutex> lock(cl->mu);
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
{
|
||||
cl->refs--;
|
||||
if (cl->refs <= 0)
|
||||
delete cl;
|
||||
continue;
|
||||
}
|
||||
if (!cl->try_send(ring, false/*, lock*/))
|
||||
{
|
||||
// ring is full (rare but what if...)
|
||||
ring->submit();
|
||||
mu.lock();
|
||||
for (; i < clients.size(); i++)
|
||||
{
|
||||
add_pending(clients[i]);
|
||||
}
|
||||
mu.unlock();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->refs--;
|
||||
}
|
||||
}
|
||||
ring->submit();
|
||||
}
|
||||
|
||||
void msgr_iothread_t::run()
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
mu.lock();
|
||||
if (stopped)
|
||||
return;
|
||||
mu.unlock();
|
||||
ring->loop();
|
||||
ring->wait();
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::init()
|
||||
{
|
||||
#ifdef WITH_RDMA
|
||||
|
@ -35,7 +169,7 @@ void osd_messenger_t::init()
|
|||
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
|
||||
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
|
||||
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
|
||||
tfd->set_fd_handler(rdma_context->channel->fd, EPOLLIN, [this](int notify_fd, int epoll_events)
|
||||
{
|
||||
handle_rdma_events();
|
||||
});
|
||||
|
@ -43,6 +177,19 @@ void osd_messenger_t::init()
|
|||
}
|
||||
}
|
||||
#endif
|
||||
if (ringloop && iothread_count > 0)
|
||||
{
|
||||
for (int i = 0; i < iothread_count; i++)
|
||||
{
|
||||
auto iot = new msgr_iothread_t();
|
||||
iothreads.push_back(iot);
|
||||
}
|
||||
completion_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
|
||||
if (completion_eventfd < 0)
|
||||
{
|
||||
throw std::runtime_error(std::string("failed to create completion eventfd: ")+strerror(errno));
|
||||
}
|
||||
}
|
||||
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
|
||||
{
|
||||
auto cl_it = clients.begin();
|
||||
|
@ -120,6 +267,11 @@ void osd_messenger_t::init()
|
|||
|
||||
osd_messenger_t::~osd_messenger_t()
|
||||
{
|
||||
if (completion_eventfd >= 0)
|
||||
{
|
||||
close(completion_eventfd);
|
||||
completion_eventfd = -1;
|
||||
}
|
||||
if (keepalive_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(keepalive_timer_id);
|
||||
|
@ -129,6 +281,14 @@ osd_messenger_t::~osd_messenger_t()
|
|||
{
|
||||
stop_client(clients.begin()->first, true, true);
|
||||
}
|
||||
if (iothreads.size())
|
||||
{
|
||||
for (auto iot: iothreads)
|
||||
{
|
||||
delete iot;
|
||||
}
|
||||
iothreads.clear();
|
||||
}
|
||||
#ifdef WITH_RDMA
|
||||
if (rdma_context)
|
||||
{
|
||||
|
@ -262,7 +422,8 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
|
|||
clients[peer_fd]->connect_timeout_id = -1;
|
||||
clients[peer_fd]->osd_num = peer_osd;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
|
||||
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
|
||||
tfd->set_fd_handler(peer_fd, EPOLLOUT, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Either OUT (connected) or HUP
|
||||
handle_connect_epoll(peer_fd);
|
||||
|
@ -303,7 +464,7 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
|
|||
int one = 1;
|
||||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
|
||||
cl->peer_state = PEER_CONNECTED;
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
});
|
||||
|
@ -327,6 +488,14 @@ void osd_messenger_t::handle_peer_epoll(int peer_fd, int epoll_events)
|
|||
{
|
||||
// Mark client as ready (i.e. some data is available)
|
||||
auto cl = clients[peer_fd];
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->peer_state == PEER_RDMA)
|
||||
{
|
||||
// FIXME: Do something better than just forgetting the FD
|
||||
// FIXME: Ignore pings during RDMA state transition
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
cl->read_ready++;
|
||||
if (cl->read_ready == 1)
|
||||
{
|
||||
|
@ -487,7 +656,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||
fprintf(stderr, "Connected to OSD %ju using RDMA\n", cl->osd_num);
|
||||
}
|
||||
cl->peer_state = PEER_RDMA;
|
||||
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(cl->peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
|
@ -527,8 +696,9 @@ void osd_messenger_t::accept_connections(int listen_fd)
|
|||
clients[peer_fd]->peer_fd = peer_fd;
|
||||
clients[peer_fd]->peer_state = PEER_CONNECTED;
|
||||
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
|
||||
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
|
||||
// Add FD to epoll
|
||||
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
tfd->set_fd_handler(peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
});
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <map>
|
||||
#include <deque>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "malloc_or_die.h"
|
||||
#include "json11/json11.hpp"
|
||||
|
@ -47,6 +48,8 @@ struct msgr_rdma_context_t;
|
|||
|
||||
struct osd_client_t
|
||||
{
|
||||
std::mutex mu;
|
||||
|
||||
int refs = 0;
|
||||
|
||||
sockaddr_storage peer_addr;
|
||||
|
@ -59,6 +62,7 @@ struct osd_client_t
|
|||
osd_num_t osd_num = 0;
|
||||
|
||||
void *in_buf = NULL;
|
||||
uint32_t receive_buffer_size = 0;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
msgr_rdma_connection_t *rdma_conn = NULL;
|
||||
|
@ -89,6 +93,17 @@ struct osd_client_t
|
|||
std::vector<msgr_sendp_t> outbox, next_outbox;
|
||||
|
||||
~osd_client_t();
|
||||
|
||||
bool try_send(ring_loop_t *ringloop, bool use_sync_send_recv);
|
||||
void handle_send(int result);
|
||||
|
||||
bool try_recv(ring_loop_t *ringloop, bool use_sync_send_recv);
|
||||
bool handle_read(int result);
|
||||
bool handle_read_buffer(void *curbuf, int remain);
|
||||
bool handle_finished_read();
|
||||
void handle_op_hdr();
|
||||
bool handle_reply_hdr();
|
||||
void handle_reply_ready(osd_op_t *op);
|
||||
};
|
||||
|
||||
struct osd_wanted_peer_t
|
||||
|
@ -111,6 +126,53 @@ struct osd_op_stats_t
|
|||
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
|
||||
};
|
||||
|
||||
#ifdef __MOCK__
|
||||
class msgr_iothread_t;
|
||||
#else
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "epoll_manager.h"
|
||||
|
||||
class msgr_iothread_t
|
||||
{
|
||||
protected:
|
||||
ring_loop_t *ring = NULL;
|
||||
epoll_manager_t *epmgr = NULL;
|
||||
int submit_eventfd = -1;
|
||||
bool stopped = false;
|
||||
bool pending = false;
|
||||
bool *callback_stopped = NULL;
|
||||
std::mutex mu;
|
||||
std::vector<osd_client_t*> pending_clients;
|
||||
std::thread *thread = NULL;
|
||||
|
||||
void run();
|
||||
public:
|
||||
|
||||
msgr_iothread_t();
|
||||
~msgr_iothread_t();
|
||||
|
||||
epoll_manager_t *get_epmgr()
|
||||
{
|
||||
return epmgr;
|
||||
}
|
||||
|
||||
int get_eventfd()
|
||||
{
|
||||
return submit_eventfd;
|
||||
}
|
||||
|
||||
void wakeup(osd_client_t *cl, ring_loop_t *outer_ring);
|
||||
|
||||
void add_pending(osd_client_t *cl);
|
||||
|
||||
void submit_pending_writes();
|
||||
|
||||
void stop();
|
||||
};
|
||||
#endif
|
||||
|
||||
struct osd_messenger_t
|
||||
{
|
||||
protected:
|
||||
|
@ -123,6 +185,7 @@ protected:
|
|||
int osd_ping_timeout = 0;
|
||||
int log_level = 0;
|
||||
bool use_sync_send_recv = false;
|
||||
int iothread_count = 4;
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
bool use_rdma = true;
|
||||
|
@ -134,10 +197,12 @@ protected:
|
|||
bool rdma_odp = false;
|
||||
#endif
|
||||
|
||||
std::vector<msgr_iothread_t*> iothreads;
|
||||
std::vector<int> read_ready_clients;
|
||||
std::vector<int> write_ready_clients;
|
||||
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
|
||||
std::vector<std::function<void()>> set_immediate;
|
||||
int completion_eventfd = -1;
|
||||
std::mutex completions_mu;
|
||||
std::vector<osd_op_t*> completions;
|
||||
|
||||
public:
|
||||
timerfd_manager_t *tfd;
|
||||
|
@ -188,15 +253,8 @@ protected:
|
|||
void cancel_osd_ops(osd_client_t *cl);
|
||||
void cancel_op(osd_op_t *op);
|
||||
|
||||
bool try_send(osd_client_t *cl);
|
||||
void handle_send(int result, osd_client_t *cl);
|
||||
|
||||
bool handle_read(int result, osd_client_t *cl);
|
||||
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
|
||||
bool handle_finished_read(osd_client_t *cl);
|
||||
void handle_op_hdr(osd_client_t *cl);
|
||||
bool handle_reply_hdr(osd_client_t *cl);
|
||||
void handle_reply_ready(osd_op_t *op);
|
||||
void add_completion(osd_op_t *op, ring_loop_t *ringloop = NULL);
|
||||
void handle_completions();
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
void try_send_rdma(osd_client_t *cl);
|
||||
|
|
|
@ -363,6 +363,8 @@ bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64
|
|||
auto cl = clients.at(peer_fd);
|
||||
cl->rdma_conn = rdma_conn;
|
||||
cl->peer_state = PEER_RDMA_CONNECTING;
|
||||
// Add the initial receive request
|
||||
try_recv_rdma(cl);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -603,7 +605,7 @@ void osd_messenger_t::handle_rdma_events()
|
|||
if (!is_send)
|
||||
{
|
||||
rc->cur_recv--;
|
||||
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
||||
if (!cl->handle_read_buffer(rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
|
||||
{
|
||||
// handle_read_buffer may stop the client
|
||||
continue;
|
||||
|
@ -666,9 +668,5 @@ void osd_messenger_t::handle_rdma_events()
|
|||
}
|
||||
}
|
||||
} while (event_count > 0);
|
||||
for (auto cb: set_immediate)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
set_immediate.clear();
|
||||
handle_received_ops();
|
||||
}
|
||||
|
|
|
@ -9,53 +9,64 @@ void osd_messenger_t::read_requests()
|
|||
{
|
||||
int peer_fd = read_ready_clients[i];
|
||||
osd_client_t *cl = clients[peer_fd];
|
||||
if (cl->read_msg.msg_iovlen)
|
||||
if (!cl->try_recv(ringloop, use_sync_send_recv))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (cl->read_remaining < receive_buffer_size)
|
||||
{
|
||||
cl->read_iov.iov_base = cl->in_buf;
|
||||
cl->read_iov.iov_len = receive_buffer_size;
|
||||
cl->read_msg.msg_iov = &cl->read_iov;
|
||||
cl->read_msg.msg_iovlen = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->read_iov.iov_base = 0;
|
||||
cl->read_iov.iov_len = cl->read_remaining;
|
||||
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
|
||||
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
|
||||
}
|
||||
cl->refs++;
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
||||
return;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_read(data->res, cl); };
|
||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
int result = recvmsg(peer_fd, &cl->read_msg, 0);
|
||||
if (result < 0)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
handle_read(result, cl);
|
||||
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
read_ready_clients.clear();
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
||||
bool osd_client_t::try_recv(ring_loop_t *ringloop, bool use_sync_send_recv)
|
||||
{
|
||||
auto cl = this;
|
||||
if (cl->read_msg.msg_iovlen)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if (cl->read_remaining < cl->receive_buffer_size)
|
||||
{
|
||||
cl->read_iov.iov_base = cl->in_buf;
|
||||
cl->read_iov.iov_len = cl->receive_buffer_size;
|
||||
cl->read_msg.msg_iov = &cl->read_iov;
|
||||
cl->read_msg.msg_iovlen = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
cl->read_iov.iov_base = 0;
|
||||
cl->read_iov.iov_len = cl->read_remaining;
|
||||
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
|
||||
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
|
||||
}
|
||||
cl->refs++;
|
||||
if (ringloop && !use_sync_send_recv)
|
||||
{
|
||||
io_uring_sqe* sqe = ringloop->get_sqe();
|
||||
if (!sqe)
|
||||
{
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
return false;
|
||||
}
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this](ring_data_t *data) { handle_read(data->res); };
|
||||
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
int result = recvmsg(peer_fd, &cl->read_msg, 0);
|
||||
if (result < 0)
|
||||
{
|
||||
result = -errno;
|
||||
}
|
||||
handle_read(result);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool osd_client_t::handle_read(int result)
|
||||
{
|
||||
auto cl = this;
|
||||
bool ret = false;
|
||||
cl->read_msg.msg_iovlen = 0;
|
||||
cl->refs--;
|
||||
|
@ -74,7 +85,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
|||
{
|
||||
fprintf(stderr, "Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||
}
|
||||
stop_client(cl->peer_fd);
|
||||
close(cl->peer_fd);
|
||||
return false;
|
||||
}
|
||||
if (result == -EAGAIN || result == -EINTR || result < cl->read_iov.iov_len)
|
||||
|
@ -91,7 +102,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
|||
{
|
||||
if (cl->read_iov.iov_base == cl->in_buf)
|
||||
{
|
||||
if (!handle_read_buffer(cl, cl->in_buf, result))
|
||||
if (!handle_read_buffer(cl->in_buf, result))
|
||||
{
|
||||
goto fin;
|
||||
}
|
||||
|
@ -103,7 +114,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
|||
cl->recv_list.eat(result);
|
||||
if (cl->recv_list.done >= cl->recv_list.count)
|
||||
{
|
||||
if (!handle_finished_read(cl))
|
||||
if (!handle_finished_read())
|
||||
{
|
||||
goto fin;
|
||||
}
|
||||
|
@ -115,16 +126,13 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
|
|||
}
|
||||
}
|
||||
fin:
|
||||
for (auto cb: set_immediate)
|
||||
{
|
||||
cb();
|
||||
}
|
||||
set_immediate.clear();
|
||||
msgr->handle_completions();
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
|
||||
bool osd_client_t::handle_read_buffer(void *curbuf, int remain)
|
||||
{
|
||||
auto cl = this;
|
||||
// Compose operation(s) from the buffer
|
||||
while (remain > 0)
|
||||
{
|
||||
|
@ -160,7 +168,7 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
|
|||
}
|
||||
if (cl->recv_list.done >= cl->recv_list.count)
|
||||
{
|
||||
if (!handle_finished_read(cl))
|
||||
if (!handle_finished_read())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
@ -169,19 +177,20 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
|
|||
return true;
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
||||
bool osd_client_t::handle_finished_read()
|
||||
{
|
||||
auto cl = this;
|
||||
cl->recv_list.reset();
|
||||
if (cl->read_state == CL_READ_HDR)
|
||||
{
|
||||
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
|
||||
return handle_reply_hdr(cl);
|
||||
return handle_reply_hdr();
|
||||
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
|
||||
handle_op_hdr(cl);
|
||||
handle_op_hdr();
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
|
||||
stop_client(cl->peer_fd);
|
||||
close(cl->peer_fd);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -189,14 +198,14 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
|||
{
|
||||
// Operation is ready
|
||||
cl->received_ops.push_back(cl->read_op);
|
||||
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
|
||||
msgr->add_completion(cl->read_op);
|
||||
cl->read_op = NULL;
|
||||
cl->read_state = 0;
|
||||
}
|
||||
else if (cl->read_state == CL_READ_REPLY_DATA)
|
||||
{
|
||||
// Reply is ready
|
||||
handle_reply_ready(cl->read_op);
|
||||
msgr->add_completion(cl->read_op);
|
||||
cl->read_op = NULL;
|
||||
cl->read_state = 0;
|
||||
}
|
||||
|
@ -207,8 +216,9 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
|
|||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
||||
void osd_client_t::handle_op_hdr()
|
||||
{
|
||||
auto cl = this;
|
||||
osd_op_t *cur_op = cl->read_op;
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
|
||||
{
|
||||
|
@ -285,20 +295,21 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
|
|||
{
|
||||
// Operation is ready
|
||||
cl->received_ops.push_back(cur_op);
|
||||
set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
|
||||
msgr->add_completion(cur_op);
|
||||
cl->read_op = NULL;
|
||||
cl->read_state = 0;
|
||||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
bool osd_client_t::handle_reply_hdr()
|
||||
{
|
||||
auto cl = this;
|
||||
auto req_it = cl->sent_ops.find(cl->read_op->req.hdr.id);
|
||||
if (req_it == cl->sent_ops.end())
|
||||
{
|
||||
// Command out of sync. Drop connection
|
||||
fprintf(stderr, "Client %d command out of sync: id %ju\n", cl->peer_fd, cl->read_op->req.hdr.id);
|
||||
stop_client(cl->peer_fd);
|
||||
close(cl->peer_fd);
|
||||
return false;
|
||||
}
|
||||
osd_op_t *op = req_it->second;
|
||||
|
@ -315,7 +326,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
|||
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %jd+%u\n",
|
||||
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
|
||||
cl->sent_ops[op->req.hdr.id] = op;
|
||||
stop_client(cl->peer_fd);
|
||||
close(cl->peer_fd);
|
||||
return false;
|
||||
}
|
||||
if (bmp_len > 0)
|
||||
|
@ -383,7 +394,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
|||
{
|
||||
reuse:
|
||||
// It's fine to reuse cl->read_op for the next reply
|
||||
handle_reply_ready(op);
|
||||
msgr->add_completion(op);
|
||||
cl->recv_list.push_back(cl->read_op->req.buf, OSD_PACKET_SIZE);
|
||||
cl->read_remaining = OSD_PACKET_SIZE;
|
||||
cl->read_state = CL_READ_HDR;
|
||||
|
@ -391,24 +402,65 @@ reuse:
|
|||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_reply_ready(osd_op_t *op)
|
||||
static uint64_t one = 1;
|
||||
|
||||
void osd_messenger_t::add_completion(osd_op_t *op, ring_loop_t *ringloop)
|
||||
{
|
||||
// Measure subop latency
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
if (!stats.subop_stat_count[op->req.hdr.opcode])
|
||||
completions_mu.lock();
|
||||
bool wakeup_main_thread = !completions.size();
|
||||
completions.push_back(op);
|
||||
completions_mu.unlock();
|
||||
if (wakeup_main_thread)
|
||||
{
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
|
||||
io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
|
||||
if (!sqe)
|
||||
{
|
||||
write(completion_eventfd, &one, sizeof(one));
|
||||
}
|
||||
else
|
||||
{
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [](ring_data_t*){};
|
||||
my_uring_prep_write(sqe, completion_eventfd, &one, sizeof(one), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_completions()
|
||||
{
|
||||
// Reset eventfd counter
|
||||
uint64_t ctr = 0;
|
||||
int r = read(completion_eventfd, &ctr, 8);
|
||||
if (r < 0 && errno != EAGAIN && errno != EINTR)
|
||||
{
|
||||
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
|
||||
}
|
||||
completions_mu.lock();
|
||||
auto ops = std::move(completions);
|
||||
completions_mu.unlock();
|
||||
for (auto op: completions)
|
||||
{
|
||||
if (op->op_type == OSD_OP_IN)
|
||||
{
|
||||
exec_op(op);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Measure subop latency
|
||||
timespec tv_end;
|
||||
clock_gettime(CLOCK_REALTIME, &tv_end);
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
if (!stats.subop_stat_count[op->req.hdr.opcode])
|
||||
{
|
||||
stats.subop_stat_count[op->req.hdr.opcode]++;
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
|
||||
}
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
// Copy lambda to be unaffected by `delete op`
|
||||
std::function<void(osd_op_t*)>(op->callback)(op);
|
||||
}
|
||||
}
|
||||
stats.subop_stat_sum[op->req.hdr.opcode] += (
|
||||
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
|
||||
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
|
||||
);
|
||||
set_immediate.push_back([op]()
|
||||
{
|
||||
// Copy lambda to be unaffected by `delete op`
|
||||
std::function<void(osd_op_t*)>(op->callback)(op);
|
||||
});
|
||||
}
|
||||
|
|
|
@ -15,31 +15,35 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||
{
|
||||
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
|
||||
}
|
||||
else
|
||||
else if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
// Check that operation actually belongs to this client
|
||||
// FIXME: Review if this is still needed
|
||||
bool found = false;
|
||||
for (auto it = cl->received_ops.begin(); it != cl->received_ops.end(); it++)
|
||||
measure_exec(cur_op);
|
||||
}
|
||||
std::unique_lock<std::mutex> lock;
|
||||
if (iothreads.size())
|
||||
{
|
||||
lock = std::unique_lock<std::mutex>(cl->mu);
|
||||
}
|
||||
// Check that operation actually belongs to this client
|
||||
bool found = false;
|
||||
for (auto it = cl->received_ops.begin(); it != cl->received_ops.end(); it++)
|
||||
{
|
||||
if (*it == cur_op)
|
||||
{
|
||||
if (*it == cur_op)
|
||||
{
|
||||
found = true;
|
||||
cl->received_ops.erase(it, it+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
delete cur_op;
|
||||
return;
|
||||
found = true;
|
||||
cl->received_ops.erase(it, it+1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
delete cur_op;
|
||||
return;
|
||||
}
|
||||
auto & to_send_list = cl->write_msg.msg_iovlen ? cl->next_send_list : cl->send_list;
|
||||
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
|
||||
if (cur_op->op_type == OSD_OP_IN)
|
||||
{
|
||||
measure_exec(cur_op);
|
||||
to_send_list.push_back((iovec){ .iov_base = cur_op->reply.buf, .iov_len = OSD_PACKET_SIZE });
|
||||
}
|
||||
else
|
||||
|
@ -117,12 +121,18 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
|
|||
// FIXME: It's worse because it doesn't allow batching
|
||||
while (cl->outbox.size())
|
||||
{
|
||||
try_send(cl);
|
||||
cl->try_send(NULL, true);
|
||||
}
|
||||
}
|
||||
else if (iothreads.size())
|
||||
{
|
||||
auto iot = iothreads[cl->peer_fd % iothreads.size()];
|
||||
cl->refs++;
|
||||
iot->wakeup(cl, ringloop);
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
|
||||
if ((cl->write_msg.msg_iovlen > 0 || !cl->try_send(ringloop, use_sync_send_recv)) && (cl->write_state == 0))
|
||||
{
|
||||
cl->write_state = CL_WRITE_READY;
|
||||
write_ready_clients.push_back(cur_op->peer_fd);
|
||||
|
@ -180,8 +190,9 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
|
|||
}
|
||||
}
|
||||
|
||||
bool osd_messenger_t::try_send(osd_client_t *cl)
|
||||
bool osd_client_t::try_send(ring_loop_t *ringloop, bool use_sync_send_recv)
|
||||
{
|
||||
auto cl = this;
|
||||
int peer_fd = cl->peer_fd;
|
||||
if (!cl->send_list.size() || cl->write_msg.msg_iovlen > 0)
|
||||
{
|
||||
|
@ -198,7 +209,7 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
|||
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
|
||||
cl->refs++;
|
||||
ring_data_t* data = ((ring_data_t*)sqe->user_data);
|
||||
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
|
||||
data->callback = [this](ring_data_t *data) { handle_send(data->res); };
|
||||
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
|
||||
}
|
||||
else
|
||||
|
@ -211,18 +222,27 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
|
|||
{
|
||||
result = -errno;
|
||||
}
|
||||
handle_send(result, cl);
|
||||
handle_send(result);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void osd_messenger_t::send_replies()
|
||||
{
|
||||
if (iothreads.size())
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < write_ready_clients.size(); i++)
|
||||
{
|
||||
int peer_fd = write_ready_clients[i];
|
||||
auto cl_it = clients.find(peer_fd);
|
||||
if (cl_it != clients.end() && !try_send(cl_it->second))
|
||||
if (cl_it == clients.end())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
auto cl = cl_it->second;
|
||||
if (!cl->try_send(ringloop, use_sync_send_recv))
|
||||
{
|
||||
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
|
||||
return;
|
||||
|
@ -231,8 +251,9 @@ void osd_messenger_t::send_replies()
|
|||
write_ready_clients.clear();
|
||||
}
|
||||
|
||||
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
||||
void osd_client_t::handle_send(int result)
|
||||
{
|
||||
auto cl = this;
|
||||
cl->write_msg.msg_iovlen = 0;
|
||||
cl->refs--;
|
||||
if (cl->peer_state == PEER_STOPPED)
|
||||
|
@ -247,7 +268,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|||
{
|
||||
// this is a client socket, so don't panic. just disconnect it
|
||||
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
|
||||
stop_client(cl->peer_fd);
|
||||
close(cl->peer_fd);
|
||||
return;
|
||||
}
|
||||
if (result >= 0)
|
||||
|
@ -289,23 +310,11 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
|
|||
#ifdef WITH_RDMA
|
||||
if (cl->rdma_conn && !cl->outbox.size() && cl->peer_state == PEER_RDMA_CONNECTING)
|
||||
{
|
||||
// FIXME: Do something better than just forgetting the FD
|
||||
// FIXME: Ignore pings during RDMA state transition
|
||||
if (log_level > 0)
|
||||
{
|
||||
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
|
||||
}
|
||||
cl->peer_state = PEER_RDMA;
|
||||
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
|
||||
{
|
||||
// Do not miss the disconnection!
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
handle_peer_epoll(peer_fd, epoll_events);
|
||||
}
|
||||
});
|
||||
// Add the initial receive request
|
||||
try_recv_rdma(cl);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -78,7 +78,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
|
|||
}
|
||||
#ifndef __MOCK__
|
||||
// Then remove FD from the eventloop so we don't accidentally read something
|
||||
tfd->set_fd_handler(peer_fd, false, NULL);
|
||||
tfd->set_fd_handler(peer_fd, 0, NULL);
|
||||
if (cl->connect_timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(cl->connect_timeout_id);
|
||||
|
|
|
@ -655,7 +655,7 @@ help:
|
|||
ringloop->register_consumer(&consumer);
|
||||
// Add FD to epoll
|
||||
bool stop = false;
|
||||
epmgr->tfd->set_fd_handler(sockfd[0], false, [this, &stop](int peer_fd, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(sockfd[0], EPOLLIN, [this, &stop](int peer_fd, int epoll_events)
|
||||
{
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
|
|
|
@ -185,7 +185,7 @@ void kv_cli_t::run()
|
|||
fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
|
||||
try
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
|
||||
epmgr->tfd->set_fd_handler(0, EPOLLIN, [this](int fd, int events)
|
||||
{
|
||||
if (events & EPOLLIN)
|
||||
{
|
||||
|
@ -193,7 +193,7 @@ void kv_cli_t::run()
|
|||
}
|
||||
if (events & EPOLLRDHUP)
|
||||
{
|
||||
epmgr->tfd->set_fd_handler(0, false, NULL);
|
||||
epmgr->tfd->set_fd_handler(0, 0, NULL);
|
||||
finished = true;
|
||||
}
|
||||
});
|
||||
|
|
|
@ -243,7 +243,7 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
// Create NFS socket and add it to epoll
|
||||
int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, &listening_port);
|
||||
fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
|
||||
epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(nfs_socket, EPOLLIN, [this](int nfs_socket, int epoll_events)
|
||||
{
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
|
@ -260,7 +260,7 @@ void nfs_proxy_t::run(json11::Json cfg)
|
|||
// Create portmap socket and add it to epoll
|
||||
int portmap_socket = create_and_bind_socket(bind_address, 111, 128, NULL);
|
||||
fcntl(portmap_socket, F_SETFL, fcntl(portmap_socket, F_GETFL, 0) | O_NONBLOCK);
|
||||
epmgr->tfd->set_fd_handler(portmap_socket, false, [this](int portmap_socket, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(portmap_socket, EPOLLIN, [this](int portmap_socket, int epoll_events)
|
||||
{
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
{
|
||||
|
@ -466,7 +466,7 @@ void nfs_proxy_t::do_accept(int listen_fd)
|
|||
{
|
||||
cli->proc_table.insert(fn);
|
||||
}
|
||||
epmgr->tfd->set_fd_handler(nfs_fd, true, [cli](int nfs_fd, int epoll_events)
|
||||
epmgr->tfd->set_fd_handler(nfs_fd, EPOLLOUT, [cli](int nfs_fd, int epoll_events)
|
||||
{
|
||||
// Handle incoming event
|
||||
if (epoll_events & EPOLLRDHUP)
|
||||
|
@ -723,7 +723,7 @@ void nfs_client_t::stop()
|
|||
stopped = true;
|
||||
if (refs <= 0)
|
||||
{
|
||||
parent->epmgr->tfd->set_fd_handler(nfs_fd, true, NULL);
|
||||
parent->epmgr->tfd->set_fd_handler(nfs_fd, 0, NULL);
|
||||
close(nfs_fd);
|
||||
delete this;
|
||||
}
|
||||
|
|
|
@ -361,7 +361,7 @@ void osd_t::bind_socket()
|
|||
listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
|
||||
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
|
||||
|
||||
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
|
||||
epmgr->set_fd_handler(listen_fd, EPOLLIN, [this](int fd, int events)
|
||||
{
|
||||
msgr.accept_connections(listen_fd);
|
||||
});
|
||||
|
|
|
@ -247,6 +247,7 @@ resume_8:
|
|||
finish:
|
||||
if (cur_op->peer_fd)
|
||||
{
|
||||
// FIXME: Do it before executing sync
|
||||
auto it = msgr.clients.find(cur_op->peer_fd);
|
||||
if (it != msgr.clients.end())
|
||||
it->second->dirty_pgs.clear();
|
||||
|
|
|
@ -54,14 +54,14 @@ int epoll_manager_t::get_fd()
|
|||
return epoll_fd;
|
||||
}
|
||||
|
||||
void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler)
|
||||
void epoll_manager_t::set_fd_handler(int fd, int events, std::function<void(int, int)> handler)
|
||||
{
|
||||
if (handler != NULL)
|
||||
{
|
||||
bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
|
||||
epoll_event ev;
|
||||
ev.data.fd = fd;
|
||||
ev.events = (wr ? EPOLLOUT : 0) | EPOLLIN | EPOLLRDHUP | EPOLLET;
|
||||
ev.events = events | EPOLLRDHUP | EPOLLET;
|
||||
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
|
||||
{
|
||||
if (errno == ENOENT)
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <sys/epoll.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "ringloop.h"
|
||||
|
@ -21,7 +23,7 @@ public:
|
|||
epoll_manager_t(ring_loop_t *ringloop);
|
||||
~epoll_manager_t();
|
||||
int get_fd();
|
||||
void set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler);
|
||||
void set_fd_handler(int fd, int events, std::function<void(int, int)> handler);
|
||||
void handle_events(int timeout);
|
||||
|
||||
timerfd_manager_t *tfd;
|
||||
|
|
|
@ -32,12 +32,22 @@ static inline void my_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const s
|
|||
my_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_read(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset);
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset, int buf_index)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset);
|
||||
sqe->buf_index = buf_index;
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_write(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_WRITE, sqe, fd, buf, nbytes, offset);
|
||||
}
|
||||
|
||||
static inline void my_uring_prep_writev(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, unsigned nr_vecs, off_t offset)
|
||||
{
|
||||
my_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
#include <stdexcept>
|
||||
#include "timerfd_manager.h"
|
||||
|
||||
timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler)
|
||||
timerfd_manager_t::timerfd_manager_t(std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler)
|
||||
{
|
||||
this->set_fd_handler = set_fd_handler;
|
||||
wait_state = 0;
|
||||
|
@ -20,7 +20,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
|
|||
{
|
||||
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
|
||||
}
|
||||
set_fd_handler(timerfd, false, [this](int fd, int events)
|
||||
set_fd_handler(timerfd, EPOLLIN, [this](int fd, int events)
|
||||
{
|
||||
handle_readable();
|
||||
});
|
||||
|
@ -28,7 +28,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
|
|||
|
||||
timerfd_manager_t::~timerfd_manager_t()
|
||||
{
|
||||
set_fd_handler(timerfd, false, NULL);
|
||||
set_fd_handler(timerfd, 0, NULL);
|
||||
close(timerfd);
|
||||
}
|
||||
|
||||
|
|
|
@ -30,9 +30,9 @@ class timerfd_manager_t
|
|||
void trigger_nearest();
|
||||
void handle_readable();
|
||||
public:
|
||||
std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler;
|
||||
std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler;
|
||||
|
||||
timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler);
|
||||
timerfd_manager_t(std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler);
|
||||
~timerfd_manager_t();
|
||||
int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
|
||||
int set_timer_us(uint64_t micros, bool repeat, std::function<void(int)> callback);
|
||||
|
|
Loading…
Reference in New Issue