Skip to content

Commit

Permalink
Pull over updated bhtsne.py (and section of README.md) from lvdmaaten…
Browse files Browse the repository at this point in the history
… repo.
  • Loading branch information
Daniel Rapp authored and Daniel Rapp committed Mar 30, 2017
1 parent 0578101 commit 0c47fa9
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 41 deletions.
37 changes: 37 additions & 0 deletions tsne/bh_sne_src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,40 @@ numDims = 2; pcaDims = 50; perplexity = 50; theta = .5; alg = 'svd';
map = fast_tsne(digits', numDims, pcaDims, perplexity, theta, alg);
gscatter(map(:,1), map(:,2), labels');
```

Demonstration of usage in Python:

```python
import numpy as np
import bhtsne

data = np.loadtxt("mnist2500_X.txt", skiprows=1)

embedding_array = bhtsne.run_bh_tsne(data, initial_dims=data.shape[1])
```

### Python Wrapper

Usage:

```bash
python bhtsne.py [-h] [-d NO_DIMS] [-p PERPLEXITY] [-t THETA]
[-r RANDSEED] [-n INITIAL_DIMS] [-v] [-i INPUT]
[-o OUTPUT] [--use_pca] [--no_pca] [-m MAX_ITER]
```

Below are the various options the wrapper program `bhtsne.py` expects:

- `-h, --help` show this help message and exit
- `-d NO_DIMS, --no_dims NO_DIMS`
- `-p PERPLEXITY, --perplexity PERPLEXITY`
- `-t THETA, --theta THETA`: a value of 0.0 is equivalent to vanilla t-SNE.
- `-r RANDSEED, --randseed RANDSEED`
- `-n INITIAL_DIMS, --initial_dims INITIAL_DIMS`
- `-v, --verbose`
- `-i INPUT, --input INPUT`: the input file; expects a TSV with the first row as the header.
- `-o OUTPUT, --output OUTPUT`: the output file; a TSV in which each row is the `d`-dimensional embedding of one input sample.
- `--use_pca`
- `--no_pca`
- `-m MAX_ITER, --max_iter MAX_ITER`

84 changes: 43 additions & 41 deletions tsne/bh_sne_src/bhtsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,14 @@
from os import devnull
import numpy as np
import os, sys
import io

### Constants
IS_WINDOWS = True if system() == 'Windows' else False
BH_TSNE_BIN_PATH = path_join(dirname(__file__), 'windows', 'bh_tsne.exe') if IS_WINDOWS else path_join(dirname(__file__), 'bh_tsne')
assert isfile(BH_TSNE_BIN_PATH), ('Unable to find the bh_tsne binary in the '
'same directory as this script, have you forgotten to compile it?: {}'
).format(BH_TSNE_BIN_PATH)
'same directory as this script, have you forgotten to compile it?: {}'
).format(BH_TSNE_BIN_PATH)
# Default hyper-parameter values from van der Maaten (2014)
# https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf (Experimental Setup, page 13)
DEFAULT_NO_DIMS = 2
Expand All @@ -73,15 +74,15 @@ def _argparse():
argparse.add_argument('-d', '--no_dims', type=int,
default=DEFAULT_NO_DIMS)
argparse.add_argument('-p', '--perplexity', type=float,
default=DEFAULT_PERPLEXITY)
default=DEFAULT_PERPLEXITY)
# 0.0 for theta is equivalent to vanilla t-SNE
argparse.add_argument('-t', '--theta', type=float, default=DEFAULT_THETA)
argparse.add_argument('-r', '--randseed', type=int, default=EMPTY_SEED)
argparse.add_argument('-n', '--initial_dims', type=int, default=INITIAL_DIMENSIONS)
argparse.add_argument('-v', '--verbose', action='store_true')
argparse.add_argument('-i', '--input', type=FileType('r'), default=stdin)
argparse.add_argument('-o', '--output', type=FileType('w'),
default=stdout)
default=stdout)
argparse.add_argument('--use_pca', action='store_true')
argparse.add_argument('--no_pca', dest='use_pca', action='store_false')
argparse.set_defaults(use_pca=DEFAULT_USE_PCA)
Expand All @@ -92,6 +93,15 @@ def _argparse():
def _read_unpack(fmt, fh):
    """Read one binary record from *fh* and unpack it according to *fmt*.

    Exactly ``calcsize(fmt)`` bytes are requested from the file handle and
    handed to ``struct.unpack``; the resulting tuple is returned unchanged.
    """
    record_size = calcsize(fmt)
    raw_bytes = fh.read(record_size)
    return unpack(fmt, raw_bytes)


def _is_filelike_object(f):
    """Tell whether *f* is a file object rather than in-memory data.

    The check accepts ``io.IOBase`` instances and, when the interpreter
    still provides the built-in ``file`` class, instances of that as well.
    """
    accepted_types = [io.IOBase]
    try:
        # The built-in 'file' only exists on Python 2; looking it up on
        # Python 3 raises NameError and we keep io.IOBase alone.
        accepted_types.insert(0, file)
    except NameError:
        pass
    return isinstance(f, tuple(accepted_types))


def init_bh_tsne(samples, workdir, no_dims=DEFAULT_NO_DIMS, initial_dims=INITIAL_DIMENSIONS, perplexity=DEFAULT_PERPLEXITY,
theta=DEFAULT_THETA, randseed=EMPTY_SEED, verbose=False, use_pca=DEFAULT_USE_PCA, max_iter=DEFAULT_MAX_ITERATIONS):

Expand Down Expand Up @@ -128,41 +138,22 @@ def init_bh_tsne(samples, workdir, no_dims=DEFAULT_NO_DIMS, initial_dims=INITIAL
data_file.write(pack('i', randseed))

def load_data(input_file):
# Read the data, with some sanity checking
samples = []
for sample_line_num, sample_line in enumerate((l.rstrip('\n')
for l in input_file), start=1):
sample_data = sample_line.split(' ')
try:
assert len(sample_data) == dims, ('Input line #{} of '
'dimensionality {} although we have previously observed '
'lines with dimensionality {}, possible data error or is '
'the data sparsely encoded?'
).format(sample_line_num, len(sample_data), dims)
except NameError:
# First line, record the dimensionality
dims = len(sample_data)
try:
samples.append([float(e) for e in sample_data if e.strip() != ""])
except ValueError:
print sample_data

return np.asarray(samples, dtype='float64')

# Read the data, using numpy's good judgement
return np.loadtxt(input_file)

def bh_tsne(workdir, verbose=False):

# Call bh_tsne and let it do its thing
with open(devnull, 'w') as dev_null:
bh_tsne_p = Popen((abspath(BH_TSNE_BIN_PATH), ), cwd=workdir,
# bh_tsne is very noisy on stdout, tell it to use stderr
# if it is to print any output
stdout=stderr if verbose else dev_null)
# bh_tsne is very noisy on stdout, tell it to use stderr
# if it is to print any output
stdout=stderr if verbose else dev_null)
bh_tsne_p.wait()
assert not bh_tsne_p.returncode, ('ERROR: Call to bh_tsne exited '
'with a non-zero return code exit status, please ' +
('enable verbose mode and ' if not verbose else '') +
'refer to the bh_tsne output for further details')
'with a non-zero return code exit status, please ' +
('enable verbose mode and ' if not verbose else '') +
'refer to the bh_tsne output for further details')

# Read and pass on the results
with open(path_join(workdir, 'result.dat'), 'rb') as output_file:
Expand All @@ -171,18 +162,18 @@ def bh_tsne(workdir, verbose=False):
result_samples, result_dims = _read_unpack('ii', output_file)
# Collect the results, but they may be out of order
results = [_read_unpack('{}d'.format(result_dims), output_file)
for _ in range(result_samples)]
for _ in range(result_samples)]
# Now collect the landmark data so that we can return the data in
# the order it arrived
results = [(_read_unpack('i', output_file), e) for e in results]
# Put the results in order and yield it
results.sort()
for _, result in results:
yield result
# The last piece of data is the cost for each sample, we ignore it
#read_unpack('{}d'.format(sample_count), output_file)
# The last piece of data is the cost for each sample, we ignore it
#read_unpack('{}d'.format(sample_count), output_file)

def run_bh_tsne(data, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False,initial_dims=50, use_pca=True, max_iter=1000):
def run_bh_tsne(data, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False, initial_dims=50, use_pca=True, max_iter=1000):
'''
Run TSNE based on the Barnes-HT algorithm
Expand All @@ -202,18 +193,23 @@ def run_bh_tsne(data, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=

# bh_tsne works with fixed input and output paths, give it a temporary
# directory to work in so we don't clutter the filesystem
tmp_dir_path = "."
tmp_dir_path = mkdtemp()

# Load data in forked process to free memory for actual bh_tsne calculation
child_pid = os.fork()
if child_pid == 0:
if isinstance(data, file):
if _is_filelike_object(data):
data = load_data(data)

init_bh_tsne(data, tmp_dir_path, no_dims=no_dims, perplexity=perplexity, theta=theta, randseed=randseed,verbose=verbose, initial_dims=initial_dims, use_pca=use_pca, max_iter=max_iter)
sys.exit(0)
else:
os.waitpid(child_pid, 0)
return
try:
os.waitpid(child_pid, 0)
except KeyboardInterrupt:
print("Please run this program directly from python and not from ipython or jupyter.")
print("This is an issue due to asynchronous error handling.")

res = []
for result in bh_tsne(tmp_dir_path, verbose):
sample_res = []
Expand All @@ -225,10 +221,16 @@ def run_bh_tsne(data, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=


def main(args):
argp = _argparse().parse_args(args[1:])
parser = _argparse()

if len(args) <= 1:
print(parser.print_help())
return

argp = parser.parse_args(args[1:])

for result in run_bh_tsne(argp.input, no_dims=argp.no_dims, perplexity=argp.perplexity, theta=argp.theta, randseed=argp.randseed,
verbose=argp.verbose, initial_dims=argp.initial_dims, use_pca=argp.use_pca, max_iter=argp.max_iter):
verbose=argp.verbose, initial_dims=argp.initial_dims, use_pca=argp.use_pca, max_iter=argp.max_iter):
fmt = ''
for i in range(1, len(result)):
fmt = fmt + '{}\t'
Expand Down

0 comments on commit 0c47fa9

Please sign in to comment.