The following code examples, extracted from open-source Python projects, illustrate how to use six.moves.reduce().
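For readers unfamiliar with the API: six.moves.reduce resolves to the built-in reduce on Python 2 and to functools.reduce on Python 3, so the same call works on both. A minimal, self-contained sketch:

from six.moves import reduce

# Fold multiplication over a list; the third argument (1) is the initial accumulator.
values = [2, 3, 4]
product = reduce(lambda acc, x: acc * x, values, 1)
print(product)  # 24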
def crt(ak, nk):
    from six.moves import reduce
    """
    Chinese Remainder Theorem implementation using Gauss's proof,
    generalized to the case gcd(n1, n2) != 1.
    Requires len(ak) == len(nk).
    Original: https://gist.github.com/elliptic-shiho/901d223135965308a5f9ff0cf99dd7c8
    Explanation: http://elliptic-shiho.hatenablog.com/entry/2016/04/03/020117

    Args:
        ak: list of residues [a1, a2, ..., ak]
        nk: list of moduli [n1, n2, ..., nk]
    """
    assert len(ak) == len(nk)
    N = reduce(lambda x, y: x * y, nk, 1)
    l = lcm(*nk)
    s = 0
    for n, a in zip(nk, ak):
        m = N // n
        g, x, y = egcd(m, n)
        s += (m // g) * x * a
        s %= l
    return s
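A hypothetical usage sketch, not part of the original example; it assumes crt together with its egcd and lcm dependencies (the latter appears later in this listing) can be imported:

# Solve x ≡ 2 (mod 3), x ≡ 3 (mod 5), x ≡ 2 (mod 7); the expected answer is 23.
# Hypothetical call, assuming crt(), egcd() and lcm() are available.
x = crt([2, 3, 2], [3, 5, 7])
print(x)  # 23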
def get(self, key, recursive=False, sorted=False, quorum=False,
        wait=False, wait_index=None, timeout=None):
    key_chunks = split_key(key)
    if not wait:
        # Get immediately.
        try:
            node = reduce(MockNode.get_node, key_chunks, self.root)
        except KeyError:
            raise KeyNotFound(index=self.index)
        return self.make_result(Got, node, remember=False, sorted=sorted)
    # Wait...
    if wait_index is not None:
        indices = self.indices.get(key_chunks, ())
        x = bisect.bisect_left(indices, (wait_index, False))
        for index, exact in indices[x:]:
            if recursive or exact:
                # Matched past result found.
                return self.history[index]
    # Register an event and wait...
    event_key = (recursive, key_chunks)
    event = self.events.setdefault(event_key, threading.Event())
    if not event.wait(timeout):
        raise TimedOut
    index, __ = self.indices[key_chunks][-1]
    return self.history[index]
def get_common_course_modes(course_runs):
    """
    Fake implementation returning common course modes.

    Arguments:
        course_run_ids(Iterable[str]): Target Course run IDs.

    Returns:
        set: course modes found in all given course runs
    """
    course_run_modes = [
        set(seat.get("type") for seat in course_run.get("seats"))
        for course_run in FAKE_COURSE_RUNS_RESPONSE
        if course_run.get("key") in course_runs
    ]

    return six_reduce(lambda left, right: left & right, course_run_modes)
def _bitwise_filter_op(op, *filter_sets):
    output_set = filter_sets[0].copy()
    if op == 'not':
        assert len(filter_sets) == 1
        for k in output_set.keys():
            output_set[k] = _bitwise_not(filter_sets[0][k])
    elif op in ('and', 'or'):
        for k in output_set.keys():
            output_set[k] = reduce(_bitwise_and if op == 'and' else _bitwise_or,
                                   [fs[k] for fs in filter_sets])
    elif op == 'andcascade':
        for k in output_set.keys():
            output_set[k] = reduce(_bitwise_andcascade,
                                   [fs[k] for fs in filter_sets[::-1]])
    else:
        raise AssertionError('op should be one of {}'.format(
            ('and', 'or', 'andcascade', 'not')))
    return output_set
def mul_calculate(num, denum, aslist=False, out_type=None):
    if not num and not denum:
        # Smallest 1 possible.
        if aslist:
            return []
        else:
            return numpy.int8(1)

    # Make sure we do not accidentally upcast data types.
    if out_type is None:
        out_dtype = scalar.upcast(*[v.dtype for v in (num + denum)])
    else:
        out_dtype = out_type.dtype
    one = theano._asarray(1, dtype=out_dtype)

    v = reduce(numpy.multiply, num, one) / reduce(numpy.multiply, denum, one)
    if aslist:
        if numpy.all(v == 1):
            return []
        else:
            return [v]
    return v
def build_gemm_call(self):
    return reduce(str.__add__, (
        self.declare_NS,
        self.check_xyz_rank2,
        self.setup_z_Nz_Sz,
        self.check_xyz_double_or_float,
        self.check_ab_double_or_float,
        self.check_dims,
        self.check_strides,
        self.encode_strides_in_unit,
        self.compute_strides,
        self.begin_switch_typenum,
        self.case_float,
        self.case_float_ab_constants,
        self.case_float_gemm,
        self.case_double,
        self.case_double_ab_constants,
        self.case_double_gemm,
        self.end_switch_typenum), '')
def values_eq_approx_high_tol(a, b):
    """
    This function is needed so that DebugMode does not raise useless errors
    caused by rounding error. This can happen because we reduce over the two
    last dimensions, so the absolute error grows when the number of elements
    reduced over is significant.
    """
    assert a.ndim == 4
    atol = None
    if a.shape[-1] * a.shape[-2] > 100:
        # For float32 the default atol is 1e-5
        atol = 3e-5
    return CudaNdarrayType.values_eq_approx(a, b, atol=atol)
def build_vocab(self, data, candidates):
    vocab = reduce(lambda x, y: x | y, (set(
        list(chain.from_iterable(s)) + q) for s, q, a in data))
    vocab |= reduce(lambda x, y: x | y,
                    (set(candidate) for candidate in candidates))
    vocab = sorted(vocab)
    self.word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    self.sentence_size = max(
        map(len, chain.from_iterable(s for s, _, _ in data)))
    self.candidate_sentence_size = max(map(len, candidates))
    query_size = max(map(len, (q for _, q, _ in data)))
    self.memory_size = min(self.memory_size, max_story_size)
    self.vocab_size = len(self.word_idx) + 1  # +1 for nil word
    self.sentence_size = max(
        query_size, self.sentence_size)  # for the position
    # params
    print("vocab size:", self.vocab_size)
    print("Longest sentence length", self.sentence_size)
    print("Longest candidate sentence length", self.candidate_sentence_size)
    print("Longest story length", max_story_size)
    print("Average story length", mean_story_size)
def discriminator(self, name, image):
    X = image / 255.0
    if name in self.reuse.keys():
        reuse = self.reuse[name]
    else:
        self.reuse[name] = True
        reuse = False
    with tf.variable_scope(name, reuse=reuse) as scope:
        X = self.make_conv('conv1', X, shape=[4, 4, 3, 128], strides=[1, 2, 2, 1])
        X = self.leakyRelu(X, 0.2)
        # print(name, X.get_shape())
        X = self.make_conv_bn('conv2', X, shape=[4, 4, 128, 256], strides=[1, 2, 2, 1])
        X = self.leakyRelu(X, 0.2)
        # print(name, X.get_shape())
        X = self.make_conv_bn('conv3', X, shape=[4, 4, 256, 512], strides=[1, 2, 2, 1])
        X = self.leakyRelu(X, 0.2)
        # print(name, X.get_shape())
        X = self.make_conv_bn('conv4', X, shape=[4, 4, 512, 512], strides=[1, 2, 2, 1])
        X = self.leakyRelu(X, 0.2)
        # print(name, X.get_shape())

        flat_dim = reduce(lambda x, y: x * y, X.get_shape().as_list()[1:])
        X = tf.reshape(X, [-1, flat_dim])
        X = self.make_fc('fct', X, 1)
        # X = tf.nn.sigmoid(X)
        return X
def html_to_ssml(text):
    """
    Replaces specific html tags with probable SSML counterparts.
    """
    ssml_text = reduce(lambda x, y: x.replace(y, html_to_ssml_maps[y]),
                       html_to_ssml_maps, text)
    return ssml_text
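The fold-a-replacement-table-over-a-string pattern above is easy to try in isolation. A minimal, self-contained sketch; the mapping below is a made-up stand-in for the project's html_to_ssml_maps:

from six.moves import reduce

# Stand-in replacement table; the real html_to_ssml_maps is defined elsewhere
# in the project, so this mapping is purely hypothetical.
html_to_ssml_maps = {"<b>": "<emphasis>", "</b>": "</emphasis>"}

text = "say <b>this</b> louder"
# Iterating the dict yields its keys; `text` is the initial accumulator.
ssml = reduce(lambda acc, tag: acc.replace(tag, html_to_ssml_maps[tag]),
              html_to_ssml_maps, text)
print(ssml)  # say <emphasis>this</emphasis> louder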
def get_realtime_quotes(code_list, open_only=False):
    import tushare as ts

    max_len = 800
    loop_cnt = int(math.ceil(float(len(code_list)) / max_len))
    total_df = reduce(lambda df1, df2: df1.append(df2),
                      [ts.get_realtime_quotes([code for code in code_list[i::loop_cnt]])
                       for i in range(loop_cnt)])
    total_df["is_index"] = False

    index_symbol = ["sh", "sz", "hs300", "sz50", "zxb", "cyb"]
    index_df = ts.get_realtime_quotes(index_symbol)
    index_df["code"] = index_symbol
    index_df["is_index"] = True
    total_df = total_df.append(index_df)

    total_df = total_df.set_index("code").sort_index()

    columns = set(total_df.columns) - set(["name", "time", "date"])
    # columns = filter(lambda x: "_v" not in x, columns)
    for label in columns:
        total_df[label] = total_df[label].map(lambda x: 0 if str(x).strip() == "" else x)
        total_df[label] = total_df[label].astype(float)

    total_df["chg"] = total_df["price"] / total_df["pre_close"] - 1

    total_df["order_book_id"] = total_df.index
    total_df["order_book_id"] = total_df["order_book_id"].apply(tushare_code_2_order_book_id)

    total_df["datetime"] = total_df["date"] + " " + total_df["time"]
    total_df["datetime"] = total_df["datetime"].apply(
        lambda x: convert_dt_to_int(datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")))

    total_df["close"] = total_df["price"]

    if open_only:
        total_df = total_df[total_df.open > 0]

    return total_df
def gcd_multiple(*a):
    from six.moves import reduce
    """
    Apply gcd to some variables.

    Args:
        a: args list
    """
    return reduce(gcd, a)
def lcm(*a):
    from six.moves import reduce
    """
    Calculate Least Common Multiple

    Args:
        *a: args list
    """
    return reduce(op.mul, a) // gcd_multiple(*a)
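A quick sanity check of the two helpers above. This is a hypothetical usage sketch, assuming the module's gcd helper and its `import operator as op` are available; it is not part of the original code:

# Hypothetical usage of the helpers above (assumes gcd and `op` are importable
# from the same module).
print(gcd_multiple(12, 18, 24))  # gcd(gcd(12, 18), 24) == 6
print(lcm(4, 6))                 # (4 * 6) // gcd(4, 6) == 12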
def get_train_test(which_task='data/tasks_1-20_v1-2/en/', task_num=1):
    train, val, test = load_task(which_task, task_num)
    data = train + test + val

    vocab = sorted(reduce(lambda x, y: x | y,
                          (set(list(chain.from_iterable(s)) + q + a) for s, q, a in data)))
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    query_size = max(map(len, (q for _, q, _ in data)))
    if (task_num == 3):
        max_story_size = min(130, max_story_size)
    else:
        max_story_size = min(70, max_story_size)
    vocab_size = len(word_idx) + 1  # +1 for nil word
    sentence_size = max(query_size, sentence_size)  # for the position
    sentence_size += 1

    logging.info("Longest sentence length: " + str(sentence_size))
    logging.info("Longest story length: " + str(max_story_size))
    logging.info("Average story length: " + str(mean_story_size))
    logging.info("Training sample: " + str(len(train)))
    logging.info("Validation sample: " + str(len(val)))
    logging.info("Test sample: " + str(len(test)))
    logging.info("Vocab size : " + str(vocab_size))

    S, Q, A = vectorize_data(train, word_idx, sentence_size, max_story_size)
    valS, valQ, valA = vectorize_data(val, word_idx, sentence_size, max_story_size)
    testS, testQ, testA = vectorize_data(test, word_idx, sentence_size, max_story_size)

    return {'train': {'S': S, 'Q': np.expand_dims(Q, axis=1), 'A': A},
            'val': {'S': valS, 'Q': np.expand_dims(valQ, axis=1), 'A': valA},
            'test': {'S': testS, 'Q': np.expand_dims(testQ, axis=1), 'A': testA},
            'vocab': vocab, 'vocab_size': vocab_size, 'sent_len': sentence_size,
            'sent_numb': max_story_size, 'word_idx': word_idx,
            'len_training': len(train)}
def transform_by_chrom(all_epo, from_elem_list, tree, chrom, opt, out_fd):
    BED4_FRM = "%s\t%d\t%d\t%s\n"
    BED12_FRM = "%s\t%d\t%d\t%s\t1000\t+\t%d\t%d\t0,0,0\t%d\t%s\t%s\n"

    assert len(set(from_elem_list['chrom'])) <= 1

    mapped_elem_count = 0
    for from_elem in from_elem_list:
        matching_block_ids = [attrgetter("value")(_) for _ in
                              tree.find(chrom, from_elem['start'], from_elem['end'])]

        # do the actual mapping
        to_elem_slices = [_ for _ in
                          (transform(from_elem, all_epo[i], opt.gap) for i in matching_block_ids)
                          if _]
        if len(to_elem_slices) > 1 or len(to_elem_slices) == 0:
            log.debug("%s no match or in different chain/chromosomes" % (str(from_elem)))
            continue
        to_elem_slices = to_elem_slices[0]

        # apply threshold
        if (from_elem[2] - from_elem[1]) * opt.threshold > reduce(
                lambda b, a: a[2] - a[1] + b, to_elem_slices, 0):
            log.debug("%s did not pass threshold" % (str(from_elem)))
            continue

        # if to_species had insertions you can join elements
        to_elem_list = sorted(union_elements(to_elem_slices), key=lambda a: a[1])
        if to_elem_list:
            mapped_elem_count += 1
            log.debug("\tjoined to %d elements" % (len(to_elem_list)))
            if opt.format == "BED4":
                # explicit loop rather than map(): map() is lazy on Python 3
                for tel in to_elem_list:
                    out_fd.write(BED4_FRM % tel)
            else:
                start = to_elem_list[0][1]
                end = to_elem_list[-1][2]
                out_fd.write(BED12_FRM % (to_elem_list[0][0], start, end, from_elem['id'],
                                          start, end, len(to_elem_list),
                                          ",".join("%d" % (e[2] - e[1]) for e in to_elem_list),
                                          ",".join("%d" % (e[1] - start) for e in to_elem_list)))
    log.info("%s %d of %d elements mapped" % (chrom, mapped_elem_count, from_elem_list.shape[0]))
def _zero_mantissa(dval):
    """Determine whether the mantissa bits of the given double are all zero."""
    bb = _double_as_bytes(dval)
    return ((bb[1] & 0x0f) | reduce(operator.or_, bb[2:])) == 0

##
## Functions to test for IEEE 754 special values
##
def cdbhash(s):
    return reduce(lambda h, c: (((h << 5) + h) ^ ord(c)) & 0xffffffff, s, 5381)
def last(seq):
    return reduce(lambda l, r: r, seq)
def lookup_phrase(self, phrase, normalize_vec=False):
    """Looks up the given phrase in this embedding"""
    phrase = self.get_normalized(phrase)
    vectors = []
    split = phrase.split('_')
    if self.supports_phrases:
        i = 0
        while i < len(split):
            # Yes, this is O(n^2). Since phrases are all short in
            # outlier detection, this is still tractable, but it
            # could be better.
            best_match = [(l, x) for (l, x) in phrase_gen(split[i:])
                          if self.in_vocabulary(x)]
            if best_match:
                i += best_match[-1][0]
                vectors.append(np.asarray(self.get_vector(best_match[-1][1])))
            else:
                i += 1
    else:
        for word in (w for w in split if self.in_vocabulary(w)):
            vectors.append(np.asarray(self.get_vector(word)))
    if len(vectors) == 0:
        # print("OOV: " + phrase + " (split: " + str(split) + ")")
        return None
    else:
        summed = reduce(operator.add, vectors)
        average = summed / len(vectors)
        if normalize_vec:
            average = average / np.linalg.norm(average)
        return average
def run(self):
    collected = []
    for log_item_config in self._config[self.LOGS]:
        log_item = LogItem(log_item_config, self)
        matches = [load_parser(c[CProp.TYPE])(c, log_item).run()
                   for c in log_item.get_config(CProp.PARSERS)]
        collected = chain(collected, reduce(chain, matches))
    return reduce(merge_matchobj_to_dict, collected,
                  {'groups': (), 'groupdict': {}})
def set(self, key, value=None, dir=False, ttl=None, refresh=False,
        prev_value=None, prev_index=None, prev_exist=None, timeout=None):
    if refresh:
        prev_exist = True
        if value is not None:
            raise RefreshValue(index=self.index)
        elif ttl is None:
            raise RefreshTTLRequired(index=self.index)
    expiration = ttl and (datetime.utcnow() + timedelta(ttl))
    key_chunks = split_key(key)
    index = self.next_index()
    should_test = prev_value is not None or prev_index is not None
    parent_node = reduce(MockNode.get_node, key_chunks[:-1], self.root)
    try:
        node = parent_node.get_node(key_chunks[-1])
    except KeyError:
        if prev_exist or should_test:
            raise KeyNotFound(index=self.index)
        node = MockNode(key, index, value, dir, ttl, expiration)
        parent_node.add_node(node)
    else:
        if prev_exist is not None and not prev_exist:
            raise NodeExist(index=self.index)
        if refresh:
            if node.dir:
                raise NotFile(index=self.index)
            value = node.value
        self.compare(node, prev_value, prev_index)
        node.set(index, value, dir, ttl, expiration)
    if refresh:
        result_class = ComparedThenSwapped if should_test else Set
        notify = False
    else:
        result_class = Updated if prev_exist or should_test else Set
        notify = True
    return self.make_result(result_class, node, key_chunks=key_chunks, notify=notify)
def append(self, key, value=None, dir=False, ttl=None, timeout=None):
    expiration = ttl and (datetime.utcnow() + timedelta(ttl))
    key_chunks = split_key(key)
    parent_node = reduce(MockNode.get_node, key_chunks, self.root)
    for x in itertools.count(len(parent_node.nodes)):
        item_key = '%020d' % x
        if not parent_node.has_node(item_key):
            break
    key = os.path.join(key, item_key)
    index = self.next_index()
    node = MockNode(key, index, value, dir, ttl, expiration)
    parent_node.add_node(node)
    return self.make_result(Created, node, key_chunks=key_chunks)
def delete(self, key, dir=False, recursive=False,
           prev_value=None, prev_index=None, timeout=None):
    key_chunks = split_key(key)
    parent_node = reduce(MockNode.get_node, key_chunks[:-1], self.root)
    try:
        node = parent_node.get_node(key_chunks[-1])
    except KeyError:
        raise KeyNotFound(index=self.index)
    self.compare(node, prev_value, prev_index)
    parent_node.pop_node(key_chunks[-1])
    return self.make_result(Deleted, prev_node=node, key_chunks=key_chunks)
def reverse_complement(seq):
    """ Return reverse complement of a string (base) sequence.

    :param seq: Input sequence.
    :returns: Reverse complement of input sequence.
    :rtype: str
    """
    if len(seq) == 0:
        return seq
    return reduce(lambda x, y: x + y, map(base_complement, seq[::-1]))
def test_not_lazy_if_inplace(self):
    # Tests that if the outputs are scalars and the graph is big,
    # we disable the inplace opt to speed up optimization
    x = tensor.vector('x', dtype=self.dtype)
    y = tensor.vector('y', dtype=self.dtype)
    c = tensor.iscalar('c')
    mode = theano.compile.get_mode(self.mode).excluding(
        # Disable many opt to keep the graph big enough to disable
        # the opt.
        'fusion', 'local_add_canonizer', 'inplace',
        'constant_folding', 'constant_folding')
    y2 = reduce(lambda x, y: x + y, [y] + list(range(200)))
    f = theano.function([c, x, y], ifelse(c, x, y2), mode=mode)
    # For not inplace ifelse
    ifnode = [n for n in f.maker.fgraph.toposort()
              if isinstance(n.op, IfElse)]
    assert len(ifnode) == 1
    assert not ifnode[0].op.as_view
    rng = numpy.random.RandomState(utt.fetch_seed())
    xlen = rng.randint(200)
    ylen = rng.randint(200)

    vx = numpy.asarray(rng.uniform(size=(xlen,)), self.dtype)
    vy = numpy.asarray(rng.uniform(size=(ylen,)), self.dtype)

    assert numpy.allclose(vx, f(1, vx, vy))
    assert numpy.allclose(vy + sum(range(200)), f(0, vx, vy))
def local_useless_reduce(node):
    """Sum(a, axis=[]) -> a  """
    if isinstance(node.op, T.CAReduce):
        summed, = node.inputs
        # if reduce were doing anything, the output ndim would be reduced
        if summed.type == node.outputs[0].type:
            return [summed]

# Enabling this optimization at canonicalization step break this test:
# theano/tensor/tests/test_opt.py:T_local_reduce.test_local_reduce_broadcast_some_0
# see gh-790 issue.
#
# @register_canonicalize
def add_calculate(num, denum, aslist=False, out_type=None):
    # TODO: make sure that this function and mul_calculate are similar
    if out_type is None:
        zero = 0.0
    else:
        zero = theano._asarray(0, dtype=out_type.dtype)
    # zero = 0.0 if out_type is None else theano._asarray(0,
    #                                                     dtype=out_type.dtype)
    v = reduce(numpy.add, num, zero) - reduce(numpy.add, denum, zero)
    if aslist:
        if numpy.all(v == 0):
            return []
        else:
            return [v]
    return v
def check_chain(r, *chain):
    """ WRITEME """
    if isinstance(r, graph.Apply):
        r = r.outputs[0]
    return _check_chain(r, reduce(list.__iadd__, ([x, 0] for x in chain)))
def local_gpu_split(node):
    if isinstance(node.op, tensor.Split):
        input = node.inputs[0]
        outs_clients = reduce(list.__add__,
                              [out.clients for out in node.outputs])
        if (input.owner and isinstance(input.owner.op, HostFromGpu) or
                any(c != 'output' and isinstance(c.op, GpuFromHost)
                    for c, idx in outs_clients)):
            new_op = GpuSplit(**node.op._props_dict())
            split_res = new_op(as_cuda_ndarray_variable(input),
                               *node.inputs[1:], return_list=True)
            return [host_from_gpu(o) for o in split_res]
    return False
def c_support_code_apply(self, node, nodename):
    # REMEMBER TO RAISE c_code_cache_version when changing any of
    # these files
    files = ['corr_gemm.cu']
    codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
             for f in files]
    return reduce(str.__add__, codes)
def c_support_code_apply(self, node, nodename):
    # REMEMBER TO RAISE c_code_cache_version when changing any of
    # these files
    files = ['corr3d_gemm.cu']
    codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
             for f in files]
    return reduce(str.__add__, codes)
def build_vocab(self, data, candidates, save=False, load=False):
    if load:
        vocab_file = open('vocab.obj', 'rb')
        vocab = pickle.load(vocab_file)
    else:
        vocab = reduce(lambda x, y: x | y,
                       (set(list(chain.from_iterable(s)) + q) for s, q, a in data))
        vocab |= reduce(lambda x, y: x | y,
                        (set(candidate) for candidate in candidates))
        vocab = sorted(vocab)
    self.word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    self.sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    self.candidate_sentence_size = max(map(len, candidates))
    query_size = max(map(len, (q for _, q, _ in data)))
    self.memory_size = min(self.memory_size, max_story_size)
    self.vocab_size = len(self.word_idx) + 1  # +1 for nil word
    self.sentence_size = max(query_size, self.sentence_size)  # for the position
    # params
    print("vocab size:", self.vocab_size)
    print("Longest sentence length", self.sentence_size)
    print("Longest candidate sentence length", self.candidate_sentence_size)
    print("Longest story length", max_story_size)
    print("Average story length", mean_story_size)
    if save:
        vocab_file = open('vocab.obj', 'wb')
        pickle.dump(vocab, vocab_file)
def evaluate(self, previousResult, original=None, special=None):
    if special not in {None, "transform", "agg"}:
        raise Exception("Special must be one of None, 'transform', or 'agg'.")
    original = original if original is not None else previousResult
    if (original._grouped_self and special
            and not isinstance(self._queue[0], FunctionStep)):
        name = GetName(self)
        # TODO: Rewrite this, this is terrible.
        go_to_index = len(self._queue)
        for idx, item in enumerate(self._queue):
            if isinstance(item, OperatorStep):
                go_to_index = idx
                break
        transform_input = lambda x: reduce(
            lambda prevResult, f: f.evaluate(prevResult, original, special=special),
            self._queue[1:go_to_index],
            x
        )
        if special == "transform":
            out = original._grouped_self[name].transform(transform_input)
        elif special == "agg":
            out = original._grouped_self[name].agg(transform_input)
        out = reduce(lambda prevResult, f: f.evaluate(prevResult, original, special=special),
                     self._queue[go_to_index:], out)
        return out
    else:
        output = reduce(lambda prevResult, f: f.evaluate(prevResult, original),
                        self._queue, original)
        return output
def reduce(func, vec):
    """
    Python2 and Python3 compatible reduce
    @params:
        `func`: The reduce function
        `vec`: The list to be reduced
    @returns:
        The reduced value
    """
    return moves.reduce(func, vec)
def get_realtime_quotes(code_list, open_only=False):
    import tushare as ts

    max_len = 800
    loop_cnt = int(math.ceil(float(len(code_list)) / max_len))
    total_df = reduce(lambda df1, df2: df1.append(df2),
                      [ts.get_realtime_quotes([code for code in code_list[i::loop_cnt]])
                       for i in range(loop_cnt)])
    total_df["is_index"] = False

    index_symbol = ["sh", "sz", "hs300", "sz50", "zxb", "cyb"]
    index_df = ts.get_realtime_quotes(index_symbol)
    index_df["code"] = index_symbol
    index_df["is_index"] = True
    total_df = total_df.append(index_df)

    columns = set(total_df.columns) - set(["name", "time", "date", "code"])
    # columns = filter(lambda x: "_v" not in x, columns)
    for label in columns:
        total_df[label] = total_df[label].map(lambda x: 0 if str(x).strip() == "" else x)
        total_df[label] = total_df[label].astype(float)

    total_df["chg"] = total_df["price"] / total_df["pre_close"] - 1

    total_df["order_book_id"] = total_df["code"]
    total_df["order_book_id"] = total_df["order_book_id"].apply(tushare_code_2_order_book_id)
    total_df = total_df.set_index("order_book_id").sort_index()

    total_df["datetime"] = total_df["date"] + " " + total_df["time"]
    total_df["datetime"] = total_df["datetime"].apply(
        lambda x: convert_dt_to_int(datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")))

    total_df["close"] = total_df["price"]
    total_df["last"] = total_df["price"]
    total_df["limit_up"] = total_df.apply(
        lambda row: row.pre_close * (1.1 if "ST" not in row["name"] else 1.05), axis=1).round(2)
    total_df["limit_down"] = total_df.apply(
        lambda row: row.pre_close * (0.9 if "ST" not in row["name"] else 0.95), axis=1).round(2)

    if open_only:
        total_df = total_df[total_df.open > 0]

    return total_df
def get_train_test(which_task='data/tasks_1-20_v1-2/en/', task_num=1):
    train, val, test = load_task(which_task, task_num)
    data = train + test + val

    vocab = sorted(reduce(lambda x, y: x | y,
                          (set(list(chain.from_iterable(s)) + q + a) for s, q, a in data)))
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    query_size = max(map(len, (q for _, q, _ in data)))
    if (task_num == 3):
        max_story_size = min(130, max_story_size)
    else:
        max_story_size = min(70, max_story_size)
    vocab_size = len(word_idx) + 1  # +1 for nil word
    sentence_size = max(query_size, sentence_size)  # for the position
    sentence_size += 1

    print("Longest sentence length", sentence_size)
    print("Longest story length", max_story_size)
    print("Average story length", mean_story_size)
    print("Training sample", len(train))
    print("Validation sample", len(val))
    print("Test sample", len(test))
    print("Vocab size ", vocab_size)

    # embeddings_mat = get_emb_matrix(vocab_size, word_idx, embed_size=embedding_size,
    #                                 emb_file='data/glove.6B.{}d.txt'.format(embedding_size))
    # embeddings_mat = pickle.load(open("emb_task1.p", "rb"))

    # train/validation/test sets
    S, Q, A = vectorize_data(train, word_idx, sentence_size, max_story_size)
    valS, valQ, valA = vectorize_data(val, word_idx, sentence_size, max_story_size)
    testS, testQ, testA = vectorize_data(test, word_idx, sentence_size, max_story_size)

    return {'train': {'S': S, 'Q': Q, 'A': A},
            'val': {'S': valS, 'Q': valQ, 'A': valA},
            'test': {'S': testS, 'Q': testQ, 'A': testA},
            'vocab': vocab, 'vocab_size': vocab_size, 'sent_len': sentence_size,
            'sent_numb': max_story_size, 'word_idx': word_idx,
            'len_training': len(train)}
def stats_from_aligned_read(read, with_clipps=False):
    """Create summary information for an aligned read (modified from tang.util.bio).

    :param read: :class:`pysam.AlignedSegment` object
    :param with_clipps:
    """
    tags = dict(read.tags)
    try:
        tags['NM']  # ensure the required NM tag is present
    except KeyError:
        raise IOError(
            "Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.")
    name = read.qname
    if read.flag == 4:
        return None
    match = reduce(lambda x, y: x + y[1] if y[0] == 0 else x, read.cigar, 0)
    ins = reduce(lambda x, y: x + y[1] if y[0] == 1 else x, read.cigar, 0)
    delt = reduce(lambda x, y: x + y[1] if y[0] == 2 else x, read.cigar, 0)

    # NM is edit distance: NM = INS + DEL + SUB
    sub = tags['NM'] - ins - delt
    length = match + ins + delt

    # Count clips:
    clipps = reduce(
        lambda x, y: x + y[1] if (y[0] == 4 or y[0] == 5) else x, read.cigar, 0)
    if with_clipps:
        length += clipps

    iden = float(match - sub) / match
    if with_clipps:
        acc = 1.0 - (float(tags['NM'] + clipps) / length)
    else:
        acc = 1.0 - (float(tags['NM']) / length)
    coverage = float(read.query_alignment_length) / read.infer_query_length()
    direction = '-' if read.is_reverse else '+'

    results = OrderedDict([
        ("name", name),
        ("ref", read.reference_name),
        ("coverage", coverage),
        ("direction", direction),
        ("aln_length", length),
        ("insertion", ins),
        ("deletion", delt),
        ("mismatch", sub),
        ("match", match - sub),
        ("identity", iden),
        ("accuracy", acc),
        ("clipps", clipps),
    ])
    return results
def get_realtime_quotes(order_book_id_list, open_only=False, include_limit=False):
    import tushare as ts

    code_list = [order_book_id_2_tushare_code(code) for code in order_book_id_list]

    max_len = 800
    loop_cnt = int(math.ceil(float(len(code_list)) / max_len))
    total_df = reduce(lambda df1, df2: df1.append(df2),
                      [ts.get_realtime_quotes([code for code in code_list[i::loop_cnt]])
                       for i in range(loop_cnt)])
    total_df["is_index"] = False

    index_symbol = ["sh", "sz", "hs300", "sz50", "zxb", "cyb"]
    index_df = ts.get_realtime_quotes(index_symbol)
    index_df["code"] = index_symbol
    index_df["is_index"] = True
    total_df = total_df.append(index_df)

    columns = set(total_df.columns) - set(["name", "time", "date", "code"])
    # columns = filter(lambda x: "_v" not in x, columns)
    for label in columns:
        total_df[label] = total_df[label].map(lambda x: 0 if str(x).strip() == "" else x)
        total_df[label] = total_df[label].astype(float)

    total_df["chg"] = total_df["price"] / total_df["pre_close"] - 1

    total_df["order_book_id"] = total_df["code"]
    total_df["order_book_id"] = total_df["order_book_id"].apply(tushare_code_2_order_book_id)
    total_df = total_df.set_index("order_book_id").sort_index()
    total_df["order_book_id"] = total_df.index

    total_df["datetime"] = total_df["date"] + " " + total_df["time"]
    # total_df["datetime"] = total_df["datetime"].apply(
    #     lambda x: convert_dt_to_int(datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")))

    total_df["close"] = total_df["price"]
    total_df["last"] = total_df["price"]

    total_df = total_df.rename(columns={
        "{}{}_p".format(base_name, i): "{}{}".format(base_name, i)
        for i in range(1, 6) for base_name in ["a", "b"]
    })
    total_df = total_df.rename(columns={"pre_close": "prev_close"})

    del total_df["code"]
    del total_df["is_index"]
    del total_df["date"]
    del total_df["time"]

    if include_limit:
        total_df["limit_up"] = total_df.apply(
            lambda row: row.prev_close * (1.1 if "ST" not in row["name"] else 1.05),
            axis=1).round(2)
        total_df["limit_down"] = total_df.apply(
            lambda row: row.prev_close * (0.9 if "ST" not in row["name"] else 0.95),
            axis=1).round(2)

    if open_only:
        total_df = total_df[total_df.open > 0]

    return total_df
def fetch(task_id=1, batch_size=32):
    # task data
    train, test = load_task(datadir, task_id)
    data = train + test

    # metadata
    vocab = sorted(reduce(lambda x, y: x | y,
                          (set(list(chain.from_iterable(s)) + q + a) for s, q, a in data)))
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    # sizes
    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    query_size = max(map(len, (q for _, q, _ in data)))
    memory_size = min(50, max_story_size)
    vocab_size = len(word_idx) + 1  # +1 for nil word
    sentence_size = max(query_size, sentence_size)  # for the position

    # train/validation/test sets
    S, Q, A = vectorize_data(train, word_idx, sentence_size, memory_size)
    trainS, valS, trainQ, valQ, trainA, valA = cross_validation.train_test_split(
        S, Q, A, test_size=.1, random_state=None)
    testS, testQ, testA = vectorize_data(test, word_idx, sentence_size, memory_size)

    # params
    n_train = trainS.shape[0]
    n_test = testS.shape[0]
    n_val = valS.shape[0]

    batches = zip(range(0, n_train - batch_size, batch_size),
                  range(batch_size, n_train, batch_size))
    batches = [(start, end) for start, end in batches]

    data = {
        'trS': trainS, 'trQ': trainQ, 'trA': trainA,
        'teS': testS, 'teQ': testQ, 'teA': testA,
        'vaS': valS, 'vaQ': valQ, 'vaA': valA,
        'batches': batches
    }
    metadata = {
        'vocab_size': vocab_size,
        'vocab': vocab,
        'word_idx': word_idx,
        'sentence_size': sentence_size,
        'memory_size': memory_size
    }
    return data, metadata