# Authors: Eric Larson <larson.eric.d@gmail.com>
#
# License: BSD (3-clause)
import numpy as np
from scipy.fftpack import fft, ifft
try:
import pycuda.gpuarray as gpuarray
from pycuda.driver import mem_get_info
from scikits.cuda import fft as cudafft
except (ImportError, OSError):
# need OSError because scikits.cuda throws it if cufft not found
pass
from .utils import sizeof_fmt, logger
# Support CUDA for FFTs; requires scikits.cuda and pycuda
# Module-level state, populated by init_cuda() once per session:
cuda_capable = False  # True once a CUDA device + 64-bit cufft are verified
cuda_multiply_inplace_complex128 = None  # elementwise kernel: b[i] *= a[i]
cuda_halve_value_complex128 = None  # elementwise kernel: a[i] /= 2.0
cuda_real_value_complex128 = None  # elementwise kernel: a[i] = real(a[i])
# test-skip decorator; rebuilt by init_cuda() to reflect real availability
requires_cuda = np.testing.dec.skipif(True, 'CUDA not initialized')
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.
    """
    global cuda_capable
    global requires_cuda
    if cuda_capable is True:
        logger.info('CUDA previously enabled, currently %s available memory'
                    % sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = _probe_cuda()
    if cuda_capable:
        logger.info('Enabling CUDA with %s available memory'
                    % sizeof_fmt(mem_get_info()[0]))
    # rebuild the test-skip decorator to reflect actual CUDA availability
    requires_cuda = np.testing.dec.skipif(not cuda_capable,
                                          'CUDA not initialized')


def _probe_cuda():
    """Probe pycuda/scikits.cuda, compile kernels; return availability."""
    global cuda_multiply_inplace_complex128
    global cuda_halve_value_complex128
    global cuda_real_value_complex128
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warn('module pycuda not found, CUDA not enabled')
        return False
    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit
    except ImportError:
        logger.warn('pycuda.autoinit could not be imported, likely '
                    'a hardware error, CUDA not enabled')
        return False
    try:
        from pycuda.elementwise import ElementwiseKernel
        # let's construct our own CUDA multiply in-place function
        dtype = 'pycuda::complex<double>'
        cuda_multiply_inplace_complex128 = \
            ElementwiseKernel(dtype + ' *a, ' + dtype + ' *b',
                              'b[i] *= a[i]', 'multiply_inplace')
        cuda_halve_value_complex128 = \
            ElementwiseKernel(dtype + ' *a', 'a[i] /= 2.0',
                              'halve_value')
        cuda_real_value_complex128 = \
            ElementwiseKernel(dtype + ' *a', 'a[i] = real(a[i])',
                              'real_value')
    except Exception:  # narrowed from a bare ``except``
        # This should never happen
        raise RuntimeError('pycuda ElementwiseKernel could not be '
                           'constructed, please report this issue '
                           'to mne-python developers with your '
                           'system information and pycuda version')
    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warn('module scikits.cuda not found, CUDA not enabled')
        return False
    # Make sure we can use 64-bit FFTs
    try:
        fft_plan = cudafft.Plan(16, np.float64, np.complex128)
        del fft_plan
    except Exception:  # cufft may raise assorted errors here
        logger.warn('Device does not support 64-bit FFTs, '
                    'CUDA not enabled')
        return False
    return True
###############################################################################
# Repeated FFT multiplication
def setup_cuda_fft_multiply_repeated(n_jobs, h_fft):
    """Set up repeated CUDA FFT multiplication with a given filter

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT multiplication.
    h_fft : array
        The filtering function that will be used repeatedly.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
        use_cuda : bool
            Whether CUDA should be used.
        fft_plan : instance of FFTPlan
            FFT plan to use in calculating the FFT.
        ifft_plan : instance of FFTPlan
            FFT plan to use in calculating the IFFT.
        x_fft : instance of gpuarray
            Empty allocated GPU space for storing the result of the
            frequency-domain multiplication.
        x : instance of gpuarray
            Empty allocated GPU space for the data to filter.
    h_fft : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or np.ndarray.
        If CUDA is enabled, h_fft will be modified appropriately for use
        with filter.fft_multiply().

    Notes
    -----
    This function is designed to be used with fft_multiply_repeated().
    """
    cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None,
                     x_fft=None, x=None, fft_len=None)
    n_fft = len(h_fft)
    if n_jobs != 'cuda':
        return n_jobs, cuda_dict, h_fft
    n_jobs = 1
    if not cuda_capable:
        logger.info('CUDA not used, CUDA has not been initialized, '
                    'falling back to n_jobs=1')
        return n_jobs, cuda_dict, h_fft
    # set up all arrays necessary for CUDA
    # half-spectrum length of a real-input FFT (== n_fft // 2 + 1)
    cuda_fft_len = int(n_fft // 2 + 1)
    # try setting up for float64
    try:
        fft_plan = cudafft.Plan(n_fft, np.float64, np.complex128)
        ifft_plan = cudafft.Plan(n_fft, np.complex128, np.float64)
        x_fft = gpuarray.empty(cuda_fft_len, np.complex128)
        x = gpuarray.empty(int(n_fft), np.float64)
        cuda_h_fft = h_fft[:cuda_fft_len].astype('complex128')
        # do the IFFT normalization now so we don't have to later
        cuda_h_fft /= len(h_fft)
        h_fft = gpuarray.to_gpu(cuda_h_fft)
    except Exception:  # narrowed from a bare ``except``
        logger.info('CUDA not used, could not instantiate memory '
                    '(arrays may be too large), falling back to '
                    'n_jobs=1')
        return n_jobs, cuda_dict, h_fft
    logger.info('Using CUDA for FFT FIR filtering')
    cuda_dict.update(use_cuda=True, fft_plan=fft_plan, ifft_plan=ifft_plan,
                     x_fft=x_fft, x=x, dtype=np.float64,
                     multiply_inplace=cuda_multiply_inplace_complex128)
    return n_jobs, cuda_dict, h_fft
def fft_multiply_repeated(h_fft, x, cuda_dict=None):
    """Do FFT multiplication by a filter function (possibly using CUDA)

    Parameters
    ----------
    h_fft : 1-d array or gpuarray
        The filtering array to apply.
    x : 1-d array
        The array to filter.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        None (the default) is equivalent to ``dict(use_cuda=False)``.

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if cuda_dict is None:
        # built here rather than as a default arg to avoid the
        # shared-mutable-default-argument pitfall
        cuda_dict = dict(use_cuda=False)
    if not cuda_dict['use_cuda']:
        # do the fourier-domain operations
        x = np.real(ifft(h_fft * fft(x), overwrite_x=True)).ravel()
    else:
        # do the fourier-domain operations, results in second param
        cuda_dict['x'].set(x.astype(cuda_dict['dtype']))
        cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
        cuda_dict['multiply_inplace'](h_fft, cuda_dict['x_fft'])
        # If we wanted to do it locally instead of using our own kernel:
        # cuda_seg_fft.set(cuda_seg_fft.get() * h_fft)
        cudafft.ifft(cuda_dict['x_fft'], cuda_dict['x'],
                     cuda_dict['ifft_plan'], False)
        x = np.array(cuda_dict['x'].get(), dtype=x.dtype, subok=True,
                     copy=False)
    return x
###############################################################################
# FFT Resampling
def setup_cuda_fft_resample(n_jobs, W, new_len):
    """Set up CUDA FFT resampling

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT resampling.
    W : array
        The filtering function to be used during resampling.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.
    new_len : int
        The size of the array following resampling.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
        use_cuda : bool
            Whether CUDA should be used.
        fft_plan : instance of FFTPlan
            FFT plan to use in calculating the FFT.
        ifft_plan : instance of FFTPlan
            FFT plan to use in calculating the IFFT.
        x_fft : instance of gpuarray
            Empty allocated GPU space for storing the result of the
            frequency-domain multiplication.
        x : instance of gpuarray
            Empty allocated GPU space for the data to resample.
    W : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or np.ndarray.
        If CUDA is enabled, W will be modified appropriately for use
        with filter.fft_multiply().

    Notes
    -----
    This function is designed to be used with fft_resample().
    """
    cuda_dict = dict(use_cuda=False, fft_plan=None, ifft_plan=None,
                     x_fft=None, x=None, y_fft=None, y=None)
    if n_jobs != 'cuda':
        return n_jobs, cuda_dict, W
    n_jobs = 1
    if not cuda_capable:
        logger.info('CUDA not used, CUDA has not been initialized, '
                    'falling back to n_jobs=1')
        return n_jobs, cuda_dict, W
    # try setting up for float64
    try:
        n_fft_x = len(W)
        # half-spectrum lengths of real-input FFTs (== n // 2 + 1)
        cuda_fft_len_x = int(n_fft_x // 2 + 1)
        n_fft_y = new_len
        cuda_fft_len_y = int(n_fft_y // 2 + 1)
        fft_plan = cudafft.Plan(n_fft_x, np.float64, np.complex128)
        ifft_plan = cudafft.Plan(n_fft_y, np.complex128, np.float64)
        # one buffer sized for the larger of the two directions
        x_fft = gpuarray.zeros(max(cuda_fft_len_x,
                                   cuda_fft_len_y), np.complex128)
        x = gpuarray.empty(max(int(n_fft_x),
                               int(n_fft_y)), np.float64)
        cuda_W = W[:cuda_fft_len_x].astype('complex128')
        # do the IFFT normalization now so we don't have to later
        cuda_W /= n_fft_y
        W = gpuarray.to_gpu(cuda_W)
    except Exception:  # narrowed from a bare ``except``
        logger.info('CUDA not used, could not instantiate memory '
                    '(arrays may be too large), falling back to '
                    'n_jobs=1')
        return n_jobs, cuda_dict, W
    logger.info('Using CUDA for FFT FIR filtering')
    cuda_dict.update(use_cuda=True, fft_plan=fft_plan, ifft_plan=ifft_plan,
                     x_fft=x_fft, x=x, dtype=np.float64,
                     multiply_inplace=cuda_multiply_inplace_complex128,
                     halve_value=cuda_halve_value_complex128,
                     real_value=cuda_real_value_complex128)
    return n_jobs, cuda_dict, W
def fft_resample(x, W, new_len, npad, to_remove, cuda_dict=None):
    """Do FFT resampling with a filter function (possibly using CUDA)

    Parameters
    ----------
    x : 1-d array
        The array to resample.
    W : 1-d array or gpuarray
        The filtering function to apply.
    new_len : int
        The size of the output array (before removing padding).
    npad : int
        Amount of padding to apply before resampling.
    to_remove : int
        Number of samples to remove after resampling.
    cuda_dict : dict | None
        Dictionary constructed using setup_cuda_multiply_repeated().
        None (the default) is equivalent to ``dict(use_cuda=False)``.

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if cuda_dict is None:
        # built here rather than as a default arg to avoid the
        # shared-mutable-default-argument pitfall
        cuda_dict = dict(use_cuda=False)
    # add some padding at beginning and end to make this work a little cleaner
    x = _smart_pad(x, npad)
    old_len = len(x)
    if not cuda_dict['use_cuda']:
        N = int(min(new_len, old_len))
        # use // so the slice bounds stay ints on Python 3 as well
        # (plain / would yield floats and slice() would raise TypeError)
        sl_1 = slice((N + 1) // 2)
        y_fft = np.zeros(new_len, np.complex128)
        x_fft = fft(x).ravel()
        x_fft *= W
        # copy positive-frequency half, then negative-frequency half
        y_fft[sl_1] = x_fft[sl_1]
        sl_2 = slice(-(N - 1) // 2, None)
        y_fft[sl_2] = x_fft[sl_2]
        y = np.real(ifft(y_fft, overwrite_x=True)).ravel()
    else:
        if old_len < new_len:
            x = np.concatenate((x, np.zeros(new_len - old_len, x.dtype)))
        cuda_dict['x'].set(x)
        # do the fourier-domain operations, results put in second param
        cudafft.fft(cuda_dict['x'], cuda_dict['x_fft'], cuda_dict['fft_plan'])
        cuda_dict['multiply_inplace'](W, cuda_dict['x_fft'])
        # This is not straightforward, but because x_fft and y_fft share
        # the same data (and only one half of the full DFT is stored), we
        # don't have to transfer the slice like we do in scipy. All we
        # need to worry about is the Nyquist component, either halving it
        # or taking just the real component...
        if new_len > old_len:
            if old_len % 2 == 0:
                nyq = int(old_len // 2)
                cuda_dict['halve_value'](cuda_dict['x_fft'],
                                         slice=slice(nyq, nyq + 1))
        else:
            if new_len % 2 == 0:
                nyq = int(new_len // 2)
                cuda_dict['real_value'](cuda_dict['x_fft'],
                                        slice=slice(nyq, nyq + 1))
        cudafft.ifft(cuda_dict['x_fft'], cuda_dict['x'],
                     cuda_dict['ifft_plan'], scale=False)
        y = cuda_dict['x'].get()
        if new_len < old_len:
            y = y[:new_len].copy()
    # now let's trim it back to the correct size (if there was padding)
    if to_remove > 0:
        keep = np.ones((new_len), dtype='bool')
        keep[:to_remove] = False
        keep[-to_remove:] = False
        y = np.compress(keep, y)
    return y


###############################################################################
# Misc

# this has to go in mne.cuda instead of mne.filter to avoid import errors
def _smart_pad(x, n_pad):
    """Pad vector x

    Mirrors the signal about its endpoints ("odd" reflection) so the
    padded edges stay continuous in value, with extra zeros when the
    signal is too short to supply n_pad reflected samples.
    """
    # need to pad with zeros if len(x) <= npad
    z_pad = np.zeros(max(n_pad - len(x) + 1, 0), dtype=x.dtype)
    return np.r_[z_pad, 2 * x[0] - x[n_pad:0:-1], x,
                 2 * x[-1] - x[-2:-n_pad - 2:-1], z_pad]