1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566
//! Provides the functionality for memory management across devices.
//!
//! A Tensor is a potentially multi-dimensional matrix containing information about the actual data and its structure.
//! A Coaster Tensor tracks the memory copies of the numeric data of an Tensor across the devices of the Backend
//! and manages
//!
//! * the location of these memory copies
//! * the location of the latest memory copy and
//! * the synchronisation of memory copies between devices
//!
//! This is important, as this provides a unified data interface for exectuing Tensor operations on CUDA, OpenCL and
//! common host CPU.
//!
//! A [memory copy][mem] represents one logical unit of data, which might be located at the host. The
//! Tensor, tracks the location of the data blob across the various devices that the backend might
//! consist of. This allows us to run operations on various backends with the same data blob.
//!
//! ## Terminology
//!
//! A Tensor is a homogeneous multi-dimensional array - a table of elements (usually numeric elements) of the same type,
//! indexed by tuples of positive integers. In Coaster, `dimensions` of a Tensor describe the axis for a
//! coordinate system. The numbers of dimensions is known as the `rank`. A scalar value like `3` has the rank 0, and a Rust array
//! like `[1, 2, 3]` has a rank of 1 as it has one dimension. A array of arrays like `[[1, 2, 3], [2, 3]]` has a rank
//! of 2 as it has two dimensions. The number of elements for a dimension is called `length`.
//! And the number of all elements for each dimension summed up is the `size`. These meta data about a Tensor is called
//! the `descriptor` of the Tensor.
//!
//! [frameworks]: ../frameworks/index.html
//! [mem]: ../memory/index.html
//! ## Examples
//!
//! Create a SharedTensor and fill it with some numbers:
//!
//! ```
//! # extern crate coaster;
//! use coaster::framework::IFramework;
//! use coaster::frameworks::Native;
//! use coaster::tensor::SharedTensor;
//! # fn main() {
//! // allocate memory
//! let native = Native::new();
//! let device = native.new_device(native.hardwares()).unwrap();
//! let shared_data = &mut SharedTensor::<i32>::new(&5);
//! // fill memory with some numbers
//! let mut mem = shared_data.write_only(&device).unwrap();
//! mem.as_mut_slice::<i32>().clone_from_slice(&[0, 1, 2, 3, 4]);
//! # }
//! ```
use crate::device::Error as DeviceError;
use crate::device::{IDevice, MemorySync};
use std::any::Any;
use std::cell::{Cell, RefCell};
use std::marker::PhantomData;
use std::ops::Deref;
use std::{fmt, mem};
/// Describes the Descriptor of a SharedTensor.
pub type TensorDesc = Vec<usize>;
/// BitMap type for keeping track of up-to-date locations. If number of
/// locations provided by the integer isn't enough, this type can be easily
/// replaced with BitSet at cost of a heap allocation and extra inderection
/// on access.
type BitMap = u64;
/// Number of bits in `BitMap`. It's currently no possible to get this
/// information from `BitMap` cleanly. Though there are plans to add a
/// static method or associated constant.
const BIT_MAP_SIZE: usize = 64;
struct TensorLocation {
// TODO: both .device and .mem_transfer contain the same device object.
// `device` acts as recipient for memory transfers, and `mem_transfer`
// acts as initiator. It would be nice to use `Box<Any + MemorySync>,
// but that requires boxing two vtable pointers and isn't currently
// possible (E0225). Using `Box<Device>` is impossible too, as this requires
// specifying associated type `Device::M`.
// It may be possible to manually store device in a box and keep two fat
// pointers to its contents, but it's not obvious how to erase type of
// the box to store it uniformly.
device: Box<dyn Any>,
mem_transfer: Box<dyn MemorySync>,
mem: Box<dyn Any>,
}
/// Container that handles synchronization of [Memory][1] of type `T`.
/// [1]: ../memory/index.html
pub struct SharedTensor<T> {
desc: TensorDesc,
locations: RefCell<Vec<TensorLocation>>,
up_to_date: Cell<BitMap>,
phantom: PhantomData<T>,
}
/// Describes the Descriptor of a Tensor.
pub trait ITensorDesc {
/// Returns the rank of the Tensor.
///
/// The rank of the Tensor is the number of its dimensions.
fn rank(&self) -> usize;
/// Returns the summed up length of all dimensions of the Tensor.
///
/// A Tensor of rank 2 with the following dimesion specification [5, 5] would have a size of 25.
fn size(&self) -> usize;
/// Returns the dimensions of the Tensor.
///
/// To return the length of one dimensions of the Tensor, you would call
/// tensor_desc.dims()[0] // e.g. 64
fn dims(&self) -> &Vec<usize>;
/// Returns the dimensions of the Tensor as Vec<i32>.
fn dims_i32(&self) -> Vec<i32>;
/// Returns the default stride for an Rust allocated Tensor.
///
/// A rank 2 Tensor with dimensions [a, b] has a default stride of [b, 1]
/// A rank 3 Tensor with dimensions [a, b, c] has a default stride of [b * c, c, 1]
/// A rank 4 Tensor with dimensions [a, b, c, d] has a default stride of [b * c * d, c * d, d, 1]
/// and so on.
fn default_stride(&self) -> Vec<usize> {
let mut strides: Vec<usize> = Vec::with_capacity(self.rank());
let dim_length = self.dims().len();
match dim_length {
0 => strides,
1 => {
strides.push(1);
strides
}
_ => {
let imp_dims = &self.dims()[1..dim_length];
for (i, _) in imp_dims.iter().enumerate() {
strides.push(imp_dims[i..imp_dims.len()].iter().product())
}
strides.push(1);
strides
}
}
}
/// Returns the default stride for a Rust allocated Tensor as i32.
fn default_stride_i32(&self) -> Vec<i32> {
self.default_stride().iter().map(|&e| e as i32).collect()
}
}
/// Describes a conversion into a Tensor Descriptor.
///
/// This allows for convenient creation of a new SharedTensor.
/// e.g. (2, 4) -> [2,4] or () -> [] or 2 -> [2]
pub trait IntoTensorDesc {
/// Converts the implemented type into a TensorDesc.
fn into(&self) -> TensorDesc;
}
impl IntoTensorDesc for () {
fn into(&self) -> TensorDesc {
Vec::with_capacity(1)
}
}
impl IntoTensorDesc for usize {
fn into(&self) -> TensorDesc {
vec![*self]
}
}
impl IntoTensorDesc for u32 {
fn into(&self) -> TensorDesc {
vec![*self as usize]
}
}
impl IntoTensorDesc for isize {
fn into(&self) -> TensorDesc {
vec![*self as usize]
}
}
impl IntoTensorDesc for i32 {
fn into(&self) -> TensorDesc {
vec![*self as usize]
}
}
impl IntoTensorDesc for Vec<usize> {
fn into(&self) -> TensorDesc {
self.clone()
}
}
impl<'a> IntoTensorDesc for &'a [usize] {
fn into(&self) -> TensorDesc {
From::from(self.to_owned())
}
}
impl IntoTensorDesc for (usize, usize) {
fn into(&self) -> TensorDesc {
vec![self.0, self.1]
}
}
impl IntoTensorDesc for (usize, usize, usize) {
fn into(&self) -> TensorDesc {
vec![self.0, self.1, self.2]
}
}
impl IntoTensorDesc for (usize, usize, usize, usize) {
fn into(&self) -> TensorDesc {
vec![self.0, self.1, self.2, self.3]
}
}
impl IntoTensorDesc for (usize, usize, usize, usize, usize) {
fn into(&self) -> TensorDesc {
vec![self.0, self.1, self.2, self.3, self.4]
}
}
impl IntoTensorDesc for (usize, usize, usize, usize, usize, usize) {
fn into(&self) -> TensorDesc {
vec![self.0, self.1, self.2, self.3, self.4, self.5]
}
}
macro_rules! impl_array_into_tensor_desc {
($($N:expr)+) => {
$(
impl IntoTensorDesc for [usize; $N] {
fn into(&self) -> TensorDesc {
let slice: &[_] = self;
From::from(slice)
}
}
)+
}
}
impl_array_into_tensor_desc!(1 2 3 4 5 6);
impl ITensorDesc for TensorDesc {
fn rank(&self) -> usize {
self.len()
}
fn size(&self) -> usize {
match self.rank() {
0 => 1,
_ => self.iter().product(),
}
}
fn dims(&self) -> &Vec<usize> {
self
}
fn dims_i32(&self) -> Vec<i32> {
self.iter().map(|&e| e as i32).collect()
}
}
impl<T> fmt::Debug for SharedTensor<T> {
fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
write!(f, "SharedTensor desc={:?}", self.desc)
}
}
impl<T> SharedTensor<T> {
/// Create new Tensor by allocating [Memory][1] on a Device.
/// [1]: ../memory/index.html
pub fn new<D: IntoTensorDesc>(desc: &D) -> SharedTensor<T> {
SharedTensor {
desc: desc.into(),
locations: RefCell::new(Vec::new()),
up_to_date: Cell::new(0),
phantom: PhantomData,
}
}
/// Change the shape of the Tensor.
///
/// Will return an Error if size of new shape is not equal to the old shape.
/// If you want to change the shape to one of a different size, use `resize`.
pub fn reshape<D: IntoTensorDesc>(&mut self, desc: &D) -> Result<(), Error> {
let new_desc: TensorDesc = desc.into();
if new_desc.size() == self.desc().size() {
self.desc = new_desc;
Ok(())
} else {
Err(Error::InvalidShape(
"Size of the provided shape is not equal to the old shape.",
))
}
}
/// Change the size and shape of the Tensor.
///
/// **Caution**: Drops all copies which are not on the current device.
///
/// 'reshape' is preffered over this method if the size of the old and new shape
/// are identical because it will not reallocate memory.
pub fn resize<D: IntoTensorDesc>(&mut self, desc: &D) -> Result<(), Error> {
self.locations.borrow_mut().clear();
self.up_to_date.set(0);
self.desc = desc.into();
Ok(())
}
fn get_location_index<D: IDevice>(&self, device: &D) -> Option<usize> {
for (i, loc) in self.locations.borrow().iter().enumerate() {
match loc.device.deref().downcast_ref::<D>() {
Some(ref d) if *d == device => return Some(i),
_ => {}
}
}
None
}
/// Looks up `device` in self.locations and returns its index. If lookup
/// fails then new location is created and its index is returned.
fn get_or_create_location_index<D: IDevice>(&self, device: &D) -> Result<usize, Error> {
if let Some(i) = self.get_location_index(device) {
return Ok(i);
}
if self.locations.borrow().len() == BIT_MAP_SIZE {
return Err(Error::CapacityExceeded);
}
let bytes_n = Self::mem_size(self.desc().size());
self.locations.borrow_mut().push(TensorLocation {
device: Box::new(device.clone()),
mem_transfer: Box::new(device.clone()),
mem: Box::new(D::alloc_memory(device, bytes_n)?),
});
Ok(self.locations.borrow().len() - 1)
}
// TODO: chose the best source to copy data from.
// That would require some additional traits that return costs for
// transferring data between different backends.
// Actually I think that there would be only transfers between
// `Native` <-> `Cuda` and `Native` <-> `OpenCL` in foreseeable future,
// so it's best to not overengineer here.
fn sync_if_needed(&self, dst_i: usize) -> Result<(), Error> {
if self.up_to_date.get() & (1 << dst_i) != 0 {
return Ok(());
}
let src_i = self.up_to_date.get().trailing_zeros() as usize;
assert!(src_i != BIT_MAP_SIZE);
// We need to borrow two different Vec elements: src and mut dst.
// Borrowck doesn't allow to do it in a straightforward way, so
// here is workaround.
assert!(src_i != dst_i);
let mut locs = self.locations.borrow_mut();
let (src_loc, dst_loc) = if src_i < dst_i {
let (left, right) = locs.split_at_mut(dst_i);
(&left[src_i], &mut right[0])
} else {
let (left, right) = locs.split_at_mut(src_i);
(&right[0], &mut left[dst_i])
};
// Backends may define transfers asymmetrically. E. g. CUDA may know how
// to transfer to and from Native backend, while Native may know nothing
// about CUDA at all. So if first attempt fails we change order and
// try again.
match src_loc.mem_transfer.sync_out(
src_loc.mem.deref(),
dst_loc.device.deref(),
dst_loc.mem.as_mut(),
) {
Err(DeviceError::NoMemorySyncRoute) => {}
x => return x.map_err(|e| e.into()),
}
match dst_loc.mem_transfer.sync_in(
dst_loc.mem.as_mut(),
src_loc.device.deref(),
src_loc.mem.deref(),
) {
Err(DeviceError::NoMemorySyncRoute) => {}
x => return x.map_err(|e| e.into()),
}
// If there is no direct path, we take the detour via native
// and do an indirect transfer.
if cfg!(feature = "native") {
use crate::framework::IFramework;
use crate::frameworks::native::Native;
let native_framework = Native::new();
let native_device = native_framework
.new_device(native_framework.hardwares())
.unwrap(); // FIXME
let mut native_mem = native_device.alloc_memory(self.desc.size()).unwrap(); // FIXME calculate size
match src_loc.mem_transfer.sync_out(
src_loc.mem.deref(),
&native_device,
&mut native_mem,
) {
Err(DeviceError::NoMemorySyncRoute) => {}
x => return x.map_err(|e| e.into()),
}
match dst_loc
.mem_transfer
.sync_in(dst_loc.mem.as_mut(), &native_device, &native_mem)
{
Err(DeviceError::NoMemorySyncRoute) => {}
x => return x.map_err(|e| e.into()),
}
Ok(())
} else {
Err(DeviceError::NoMemorySyncRoute.into())
}
}
// Functions `read()`, `read_write()`, `write_only()` use `unsafe` to
// extend lifetime of retured reference to internally owned memory chunk.
// Borrowck guarantees that SharedTensor outlives all of its Tensors, and
// there is only one mutable borrow. So we only need to make sure that
// memory locations won't be dropped or moved while there are live Tensors.
// It's quite easy to do: by convention we only allow to remove elements from
// `self.locations` in methods with `&mut self`. Since we store device's memory
// objects in a Box, reference to it won't change during Vec reallocations.
/// Get memory for reading on the specified `device`.
/// Can fail if memory allocation fails, or if tensor wasn't initialized yet.
pub fn read<'a, D: IDevice>(&'a self, device: &D) -> Result<&'a D::M, Error> {
if self.up_to_date.get() == 0 {
return Err(Error::UninitializedMemory);
}
let i = self.get_or_create_location_index(device)?;
self.sync_if_needed(i)?;
self.up_to_date.set(self.up_to_date.get() | (1 << i));
let locs = self.locations.borrow();
let mem: &D::M = &locs[i]
.mem
.deref()
.downcast_ref()
.expect("Broken invariant: wrong memory type");
let mem_a: &'a D::M = unsafe { ::std::mem::transmute(mem) };
Ok(mem_a)
}
/// Get memory for reading and writing on the specified `device`.
/// Can fail if memory allocation fails, or if tensor wasn't initialized yet.
pub fn read_write<'a, D: IDevice>(&'a mut self, device: &D) -> Result<&'a mut D::M, Error> {
if self.up_to_date.get() == 0 {
return Err(Error::UninitializedMemory);
}
let i = self.get_or_create_location_index(device)?;
self.sync_if_needed(i)?;
self.up_to_date.set(1 << i);
let mut locs = self.locations.borrow_mut();
let mem: &mut D::M = &mut locs[i]
.mem
.as_mut()
.downcast_mut()
.expect("Broken invariant: wrong memory type");
let mem_a: &'a mut D::M = unsafe { ::std::mem::transmute(mem) };
Ok(mem_a)
}
/// Get memory for writing only.
/// This function skips synchronization and initialization checks, since
/// contents will be overwritten anyway. By convention caller must fully
/// initialize returned memory. Failure to do so may result in use of
/// uninitialized data later. If caller has failed to overwrite memory,
/// for some reason, it must call `invalidate()` to return vector to
/// uninitialized state.
pub fn write_only<'a, D: IDevice>(&'a mut self, device: &D) -> Result<&'a mut D::M, Error> {
let i = self.get_or_create_location_index(device)?;
self.up_to_date.set(1 << i);
let mut locs = self.locations.borrow_mut();
let mem: &mut D::M = &mut locs[i]
.mem
.as_mut()
.downcast_mut()
.expect("Broken invariant: wrong memory type");
let mem_a: &'a mut D::M = unsafe { ::std::mem::transmute(mem) };
Ok(mem_a)
}
// FIXME: synchronize memory elsewhere if possible?
/// Drops memory allocation on the specified device. Returns error if
/// no memory has been allocated on this device.
pub fn drop<D: IDevice>(&mut self, device: &D) -> Result<(), Error> {
match self.get_location_index(device) {
Some(i) => {
self.locations.borrow_mut().remove(i);
let up_to_date = self.up_to_date.get();
let mask = (1 << i) - 1;
let lower = up_to_date & mask;
let upper = (up_to_date >> 1) & (!mask);
self.up_to_date.set(lower | upper);
Ok(())
}
None => Err(Error::InvalidRemove(
"Memory isn't allocated on this device",
)),
}
}
// force synchronize initialized memory to a device
/// Allocates an already filled memory block on a device.
/// This is a special needs function for performance concerns and should be avoided where possible.
fn sync<D: IDevice>(&mut self, device: &D) -> Result<(), Error> {
if self.up_to_date.get() == 0 {
return Err(Error::UninitializedMemory);
}
let i = self.get_or_create_location_index(device)?;
self.sync_if_needed(i)?;
self.up_to_date.set(1 << i);
Ok(())
}
// TODO move = sync + drop
/// Returns the number of elements for which the Tensor has been allocated.
pub fn capacity(&self) -> usize {
self.desc.size()
}
/// Returns the descriptor of the Tensor.
pub fn desc(&self) -> &TensorDesc {
&self.desc
}
/// Returns the allocated Memory size in bytes.
pub fn mem_size(capacity: usize) -> usize {
mem::size_of::<T>() * capacity
}
}
/// Errors than can occur when synchronizing memory.
#[derive(Debug, Copy, Clone, PartialEq, Eq, thiserror::Error)]
pub enum Error {
/// Error caused by operations with device: allocation, memory synchronization, etc.
#[error(transparent)]
DeviceError(#[from] DeviceError),
/// Unable to remove Memory copy from SharedTensor.
#[error("Invalid remove: {0}")]
InvalidRemove(&'static str),
/// Shape provided for reshaping is not compatible with old shape.
#[error("Invalid shape: {0}")]
InvalidShape(&'static str),
/// Maximal number of backing memories has been reached.
#[error("Capacity exceeded")]
CapacityExceeded,
/// Memory is requested for reading, but it hasn't been initialized.
#[error("Attempt to read uninitialized memory")]
UninitializedMemory,
}