/usr/share/pyshared/sklearn/datasets/mlcomp.py is in python-sklearn 0.14.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | # Copyright (c) 2010 Olivier Grisel <olivier.grisel@ensta.org>
# License: BSD 3 clause
"""Glue code to load http://mlcomp.org data as a scikit.learn dataset"""
import os
import numbers
from sklearn.datasets.base import load_files
def _load_document_classification(dataset_path, metadata, set_=None, **kwargs):
if set_ is not None:
dataset_path = os.path.join(dataset_path, set_)
return load_files(dataset_path, metadata.get('description'), **kwargs)
LOADERS = {
'DocumentClassification': _load_document_classification,
# TODO: implement the remaining domain formats
}
def load_mlcomp(name_or_id, set_="raw", mlcomp_root=None, **kwargs):
"""Load a datasets as downloaded from http://mlcomp.org
Parameters
----------
name_or_id : the integer id or the string name metadata of the MLComp
dataset to load
`set_` : select the portion to load: 'train', 'test' or 'raw'
mlcomp_root : the filesystem path to the root folder where MLComp datasets
are stored, if mlcomp_root is None, the MLCOMP_DATASETS_HOME
environment variable is looked up instead.
**kwargs : domain specific kwargs to be passed to the dataset loader.
Returns
-------
data : Bunch
Dictionary-like object, the interesting attributes are:
'filenames', the files holding the raw to learn, 'target', the
classification labels (integer index), 'target_names',
the meaning of the labels, and 'DESCR', the full description of the
dataset.
Note on the lookup process: depending on the type of name_or_id,
will choose between integer id lookup or metadata name lookup by
looking at the unzipped archives and metadata file.
TODO: implement zip dataset loading too
"""
if mlcomp_root is None:
try:
mlcomp_root = os.environ['MLCOMP_DATASETS_HOME']
except KeyError:
raise ValueError("MLCOMP_DATASETS_HOME env variable is undefined")
mlcomp_root = os.path.expanduser(mlcomp_root)
mlcomp_root = os.path.abspath(mlcomp_root)
mlcomp_root = os.path.normpath(mlcomp_root)
if not os.path.exists(mlcomp_root):
raise ValueError("Could not find folder: " + mlcomp_root)
# dataset lookup
if isinstance(name_or_id, numbers.Integral):
# id lookup
dataset_path = os.path.join(mlcomp_root, str(name_or_id))
else:
# assume name based lookup
dataset_path = None
expected_name_line = "name: " + name_or_id
for dataset in os.listdir(mlcomp_root):
metadata_file = os.path.join(mlcomp_root, dataset, 'metadata')
if not os.path.exists(metadata_file):
continue
for line in file(metadata_file):
if line.strip() == expected_name_line:
dataset_path = os.path.join(mlcomp_root, dataset)
break
if dataset_path is None:
raise ValueError("Could not find dataset with metadata line: " +
expected_name_line)
# loading the dataset metadata
metadata = dict()
metadata_file = os.path.join(dataset_path, 'metadata')
if not os.path.exists(metadata_file):
raise ValueError(dataset_path + ' is not a valid MLComp dataset')
for line in file(metadata_file):
if ":" in line:
key, value = line.split(":", 1)
metadata[key.strip()] = value.strip()
format = metadata.get('format', 'unknow')
loader = LOADERS.get(format)
if loader is None:
raise ValueError("No loader implemented for format: " + format)
return loader(dataset_path, metadata, set_=set_, **kwargs)
|