def trainModel(docMatrix, savemodel, k, iterations=10, parallelization=16,
               wordTopicPath='wordXtopic.mtx', docTopicPath='docXtopic.mtx'):
    """Fit a Spark LDA model on a Matrix Market document matrix and export it.

    Each row of the matrix read from ``docMatrix`` becomes one Spark ``Row``
    (``label`` = row index, ``features`` = the row passed through
    ``sparkToScipySparse``). The fitted model is saved to ``savemodel`` and two
    Matrix Market files are written.

    Args:
        docMatrix: path of the Matrix Market file holding the input matrix.
        savemodel: destination passed to ``model.save``.
        k: number of topics handed to ``LDA``.
        iterations: ``maxIter`` handed to ``LDA``.
        parallelization: number of slices used when parallelizing row indices.
        wordTopicPath: output file for the transposed ``topicsMatrix()`` with
            every column divided by its column sum. Defaults to the file name
            that used to be hard-coded.
        docTopicPath: output file for the collected ``topicDistribution``
            vectors of every input row. Defaults to the file name that used to
            be hard-coded.
    """
    data = mmread(docMatrix)
    # mmread yields COO for sparse files; converting once makes the per-row
    # getrow() calls below cheap instead of re-scanning the matrix every time.
    if hasattr(data, 'tocsr'):
        data = data.tocsr()

    rowRange = sc.parallelize(range(data.shape[0]), parallelization)
    dataSpark = spark.createDataFrame(
        rowRange.map(
            lambda i: Row(label=i,
                          features=sparkToScipySparse(data.getrow(i)))))

    lda = LDA(k=k, maxIter=iterations)
    model = lda.fit(dataSpark)
    model.save(savemodel)

    # Transpose the topics matrix, then normalise each column to sum to 1.
    topicMatrix = model.topicsMatrix().toArray().T
    topicMatrix = topicMatrix / topicMatrix.sum(axis=0)
    mmwrite(wordTopicPath, topicMatrix)

    docXTopics = model.transform(dataSpark)
    dxT_v2 = np.array([row['topicDistribution']
                       for row in docXTopics.collect()])
    mmwrite(docTopicPath, dxT_v2)


# Main script
def get_data(setname):
    """Assemble the topic-model inputs for the corpus called ``setname``.

    Reads the topic matrices and vocabulary through ``CorporaDataSet`` and the
    word frequencies from ``<setname>.mtx`` (row sums of that matrix).

    Args:
        setname: corpus name / file prefix.

    Returns:
        dict keyed by ``topic_word_key``, ``topic_doc_key``,
        ``doc_length_key``, ``vocabulary_key`` and ``word_freq_key``.
    """
    dataset = CorporaDataSet(setname)

    # NOTE(review): topic_word_array is read from getDocsInTopicMatrix() and
    # topic_doc_array from the transposed getWordsInTopicMatrix(). The
    # straightforward pairing had been commented out here, so the swap looks
    # deliberate -- confirm before changing it.
    topic_word_array = dataset.getDocsInTopicMatrix()
    topic_doc_array = dataset.getWordsInTopicMatrix().T

    # Every document gets a length of 1.
    doc_length_array = numpy.full([topic_doc_array.shape[0]], 1)

    # list(...) so callers receive a real list on Python 3 as well.
    vocabulary = list(dataset.loadVocabulary()[0].keys())

    # Single-argument print calls produce the same text on Python 2 and 3
    # (the doubled space matches what the old print statements emitted).
    print("topic word array shape:  {0}".format(topic_word_array.shape))
    print("topic doc shape:  {0}".format(topic_doc_array.shape))
    print("vocabulary:  {0}".format(len(vocabulary)))

    wordfreqs = mmread(setname + ".mtx").sum(1)
    word_freq_array = numpy.array(wordfreqs)[:, 0]

    return {
        topic_word_key: topic_word_array,
        topic_doc_key: topic_doc_array,
        doc_length_key: doc_length_array,
        vocabulary_key: vocabulary,
        word_freq_key: word_freq_array,
    }
def load_mtx(genome_dir):
    """Load a gene-barcode matrix from a directory of 10x-style files.

    Expects ``barcodes.tsv``, ``genes.tsv`` and ``matrix.mtx`` inside
    ``genome_dir``. Only the first column of each TSV is used.

    Args:
        genome_dir: directory containing the three files.

    Returns:
        A ``GeneBCMatrix`` built from the genes and barcodes, with its ``m``
        attribute set to the matrix read from ``matrix.mtx``.

    Raises:
        IOError: if any of the three required files is missing.
    """
    barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv")
    genes_tsv = os.path.join(genome_dir, "genes.tsv")
    matrix_mtx = os.path.join(genome_dir, "matrix.mtx")
    for filepath in [barcodes_tsv, genes_tsv, matrix_mtx]:
        if not os.path.exists(filepath):
            raise IOError("Required file not found: %s" % filepath)

    # Take column 0 explicitly instead of squeeze(): squeeze() turns a
    # single-row file into a 0-d array, which cannot be iterated below.
    barcodes = pd.read_csv(barcodes_tsv, delimiter='\t', header=None,
                           usecols=[0]).values[:, 0]
    genes = pd.read_csv(genes_tsv, delimiter='\t', header=None,
                        usecols=[0]).values[:, 0]
    genes = [cr_constants.Gene(gene_id, None, None, None, None)
             for gene_id in genes]

    matrix = sp_io.mmread(matrix_mtx)
    gbm = GeneBCMatrix(genes, barcodes)
    gbm.m = matrix
    return gbm
def getWordsInTopicMatrix(self):
    """Return the matrix stored in ``<setname>_wordXtopic.mtx``, as read by mmread."""
    return mmread(self.setname + '_wordXtopic.mtx')
def getDocsInTopicMatrix(self):
    """Return the transpose of the matrix stored in ``<setname>_docXtopic.mtx``."""
    path = self.setname + '_docXtopic.mtx'
    return mmread(path).T
def load_sparse_matrix(filepath):
    """Read the Matrix Market file at ``filepath`` and return it as a CSC matrix."""
    loaded = mmread(filepath)
    return csc_matrix(loaded)
def load_vectors(filename, is_sparse=True):
    """Load vectors from ``filename``.

    When ``is_sparse`` is truthy the file is read as Matrix Market via
    ``io.mmread``; otherwise it is read as comma-delimited text via
    ``np.loadtxt``.
    """
    if not is_sparse:
        return np.loadtxt(filename, delimiter=",")
    return io.mmread(filename)
def import_training_data(self, word_vector_file, train_label_file):
    """Replace the contents of ``self.training_data`` with freshly loaded samples.

    Args:
        word_vector_file: Matrix Market file; row ``i`` is the feature vector
            of sample ``i``.
        train_label_file: JSON file holding a sequence of labels; entry ``i``
            is converted with ``int`` and stored as the label of sample ``i``.

    One document ``{'training_num', 'vector', 'label'}`` is inserted per label.
    """
    # Load and parse both inputs BEFORE touching the collection, so an
    # unreadable or malformed file cannot wipe the existing training data.
    word_vector = io.mmread(word_vector_file)
    if hasattr(word_vector, 'toarray'):
        vector = word_vector.toarray()
    else:
        # Dense ("array" format) Matrix Market files are returned as ndarrays.
        vector = np.asarray(word_vector)
    with open(train_label_file, 'r') as f:
        label = json.load(f)

    self.training_data.delete_many({})
    self.training_data.create_index('training_num')

    for i, raw_label in enumerate(label):
        self.training_data.insert_one({
            'training_num': i,
            # tolist() yields native Python numbers; list(row) would keep
            # numpy scalar types, which the document encoder may reject.
            'vector': vector[i].tolist(),
            'label': int(raw_label),
        })
def _read_mtx(filename, return_dict=True, dtype='float32'):
    """Load a Matrix Market file as a CSR matrix cast to ``dtype``.

    Returns ``{'X': matrix}`` when ``return_dict`` is truthy, otherwise an
    ``AnnData`` wrapping the matrix.
    """
    from scipy.io import mmread
    from scipy.sparse import csr_matrix

    # The cast happens after the read; a dtype-aware reader could be faster.
    matrix = csr_matrix(mmread(filename).astype(dtype))
    logg.m('... did not find row_names or col_names')
    if not return_dict:
        return AnnData(matrix)
    return {'X': matrix}
def load_mm(mm_file):
    """Build an undirected, weighted graph from a Matrix Market file.

    The matrix must be square. One vertex is created per row/column. Every
    explicit off-diagonal entry ``(i, j, w)`` adds an edge ``i -- j`` with
    weight ``w``, unless an edge between the two vertices was already added
    (the first entry seen wins). Diagonal entries are skipped.

    Args:
        mm_file: path of the Matrix Market file.

    Returns:
        The graph, with the weights in its ``"weight"`` edge property.

    Raises:
        AssertionError: if the matrix is not square.
    """
    adj = mmread(mm_file)
    assert adj.shape[0] == adj.shape[1]

    # Initialize graph
    g = gt.Graph(directed=False)
    edge_weight = g.edge_properties["weight"] = g.new_edge_property("double")

    # Create vertex for every row/column
    g.add_vertex(adj.shape[0])

    n_entries = len(adj.data)
    print('[graph_io] Reading matrix market file with {0} explicit elements...'.format(n_entries))

    # Progress is reported about every 5% of the entries. With fewer than 20
    # entries int(0.05 * n) is 0, and taking a modulo by it would raise
    # ZeroDivisionError -- so never let the step drop below 1.
    progress_step = max(1, int(0.05 * n_entries))

    # The graph starts empty and edges are only added here, so remembering the
    # unordered vertex pairs is equivalent to asking the graph whether the
    # 'symmetric partner' exists, without a per-entry edge lookup.
    seen_pairs = set()

    # Loop over all explicit elements in the sparse matrix
    for iteration, (i, j, w) in enumerate(zip(adj.row, adj.col, adj.data)):
        # Skip self-edges.
        if i == j:
            continue

        pair = (i, j) if i < j else (j, i)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            # Index the property map with the descriptor add_edge returns.
            edge = g.add_edge(i, j)
            edge_weight[edge] = w

        if iteration % progress_step == 0:
            perc = 100 * iteration / n_entries
            print('[graph_io] {0:.1f}%'.format(perc), end='\r')

    print('\n[graph_io] Done!')
    return g


# Read a csv file, and construct an undirected weighted graph from it.
def read_mtx(filename, dtype='float32'):
    """Read a `.mtx` (Matrix Market) file.

    The matrix is cast to ``dtype`` and converted to CSR before being wrapped.

    Returns
    -------
    An :class:`~anndata.AnnData` object.
    """
    from scipy.io import mmread
    from scipy.sparse import csr_matrix

    # The cast happens after the read; a dtype-aware reader could be faster.
    matrix = csr_matrix(mmread(filename).astype(dtype))
    return AnnData(matrix)
def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -> None:
    """
    Build a .loom file from the output of a 10X Genomics cellranger run.

    Args:
        indir (str):    path to the cellranger output folder (the one that contains 'outs')
        outdir (str):   folder where the new loom file is written (defaults to indir)
        genome (str):   genome build to load (e.g. 'mm10'); if None, the first
                        non-hidden entry listed in the filtered matrices folder is used

    Returns:
        Nothing. The file is written to ``<outdir>/<sample id>.loom``, where the
        sample id is the last path component of ``indir``.
    """
    target_dir = indir if outdir is None else outdir
    sampleid = os.path.basename(os.path.abspath(indir))

    outs_dir = os.path.join(indir, 'outs')
    matrix_folder = os.path.join(outs_dir, 'filtered_gene_bc_matrices')
    if genome is None:
        visible = [entry for entry in os.listdir(matrix_folder) if not entry.startswith(".")]
        genome = visible[0]
    matrix_folder = os.path.join(matrix_folder, genome)

    matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).astype("float32").todense()

    # genes.tsv: first column is the accession, second the gene name.
    with open(os.path.join(matrix_folder, "genes.tsv"), "r") as f:
        gene_fields = [line.split("\t") for line in f.readlines()]
    accession = np.array([fields[0] for fields in gene_fields]).astype("str")
    gene = np.array([fields[1].strip() for fields in gene_fields]).astype("str")

    # Cell ids are the barcodes prefixed with the sample id.
    with open(os.path.join(matrix_folder, "barcodes.tsv"), "r") as f:
        barcodes = f.readlines()
    cellids = np.array([sampleid + ":" + barcode.strip() for barcode in barcodes]).astype("str")

    row_attrs = {"Accession": accession, "Gene": gene}
    col_attrs = {"CellID": cellids}

    analysis_dir = os.path.join(outs_dir, "analysis")

    tsne_file = os.path.join(analysis_dir, "tsne", "projection.csv")
    # In cellranger V2 the file moved one level deeper
    if not os.path.exists(tsne_file):
        tsne_file = os.path.join(analysis_dir, "tsne", "2_components", "projection.csv")
    if os.path.exists(tsne_file):
        projection = np.loadtxt(tsne_file, usecols=(1, 2), delimiter=',', skiprows=1)
        col_attrs["X"] = projection[:, 0].astype('float32')
        col_attrs["Y"] = projection[:, 1].astype('float32')

    clusters_file = os.path.join(analysis_dir, "clustering", "graphclust", "clusters.csv")
    if os.path.exists(clusters_file):
        cluster_labels = np.loadtxt(clusters_file, usecols=(1, ), delimiter=',', skiprows=1)
        # Stored ids are the file's labels minus one.
        col_attrs["ClusterID"] = cluster_labels.astype('int') - 1

    loom_path = os.path.join(target_dir, sampleid + ".loom")
    create(loom_path, matrix, row_attrs, col_attrs, file_attrs={"Genome": genome})