Python jieba module: load_userdict() example source code
We extracted the following 37 code examples from open-source Python projects to illustrate how jieba.load_userdict() is used.
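Before the project examples, a minimal sketch of the call itself: jieba.load_userdict() accepts the path of a UTF-8 text file (or an open file object) with one entry per line in the form "word [frequency] [POS tag]", where frequency and POS tag are optional. The file name and contents below are only illustrative.
import jieba
# user_dict.txt (hypothetical), one entry per line:
#     云计算 5 n
#     自然语言处理 n
jieba.load_userdict("user_dict.txt")
print("/".join(jieba.cut("自然语言处理让云计算更智能")))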
def __init__(self, dict_path = ''):
super(Singleton, self).__init__()
if not hasattr(self,'_stop_words'):
# load the user dictionary (if given) and initialize the stop-word set
if dict_path:
jieba.load_userdict(dict_path)
self._stop_words = set((
'', ' ', '\n', "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
"by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
"this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
))
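A brief, hypothetical use of the singleton above (the dictionary path and the sentence are made up): segment a text and drop anything found in the stop-word set.
seg = Singleton(dict_path="user_dict.txt")
tokens = [w for w in jieba.cut("the art of 自然语言处理") if w.lower() not in seg._stop_words]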
def post_desc_counter():
""" Count word frequencies in the post descriptions.
"""
# import thulac
post = open(os.path.join("data", "post_require.txt"),
"r", encoding="utf-8").read()
# thulac segmentation (disabled)
# thu = thulac.thulac(seg_only=True)
# thu.cut(post, text=True)
# jieba segmentation
file_path = os.path.join("data", "user_dict.txt")
jieba.load_userdict(file_path)
seg_list = jieba.cut(post, cut_all=False)
counter = dict()
for seg in seg_list:
counter[seg] = counter.get(seg, 0) + 1  # start from 0 so the first occurrence counts as 1
counter_sort = sorted(
counter.items(), key=lambda value: value[1], reverse=True)
pprint(counter_sort)
with open(os.path.join("data", "post_pre_desc_counter.csv"),
"w+", encoding="utf-8") as f:
f_csv = csv.writer(f)
f_csv.writerows(counter_sort)
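As a design note, the hand-rolled frequency dictionary above can be replaced with collections.Counter, which counts and sorts in two lines; a sketch reusing the same variables:
from collections import Counter
counter = Counter(jieba.cut(post, cut_all=False))
counter_sort = counter.most_common()  # (word, count) pairs, highest count first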
def get_hot_words(text):
jieba.analyse.set_stop_words(STOPWORDS_PATH)
jieba.load_userdict(USER_CORPUS)
df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
print(df)
df.to_excel('./hotwords/DM.xlsx', 'DM')
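extract_tags() ranks keywords by TF-IDF weight; jieba.analyse also provides a TextRank extractor with the same call shape, shown here as an alternative sketch (writing the DataFrame to .xlsx additionally requires an Excel engine such as openpyxl):
import jieba.analyse
keywords = jieba.analyse.textrank(text, topK=30, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))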
def parse():
"""parse the comments"""
import jieba
import jieba.posseg as pseg
# Load User's Dictionary
path_list = os.getcwd().split('/')
path_list.append("dict.txt")
dict_path = '/'.join(path_list)
jieba.load_userdict(dict_path)
# Dismiss these POS flags
dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
'ud', 'ug', 'uv']
comments = Comment.query.all()
for comment in comments:
word_list = []
pseg_cut = pseg.cut(comment.body)
for word, flag in pseg_cut:
if flag not in dismiss:
word_list.append(word)
comment.parsed = '/'.join(word_list)
db.session.add(comment)
print "Comment %04d Parsed!" % comment.id
db.session.commit()
print "ALL DONE!"
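For reference, jieba.posseg.cut() yields (word, flag) pairs, so the POS filtering above works on any string; a self-contained sketch with a shortened flag list:
import jieba.posseg as pseg
dismiss = ('uj', 'r', 'p', 'c')
kept = [word for word, flag in pseg.cut("这只是一个简单的例子") if flag not in dismiss]
print("/".join(kept))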
def __init__(self):
self.encoderFile = "./question.txt"
self.decoderFile = './answer.txt'
self.dictFile = 'word_dict.txt'
# load the custom word dictionary
jieba.load_userdict(self.dictFile)
# stop-words file
self.stopwordsFile = "./preprocessing/stopwords.dat"
def __init__(self):
print("tensorflow version: ", tf.__version__)
tf.reset_default_graph()
self.encoder_vec_file = "./preprocessing/enc.vec"
self.decoder_vec_file = "./preprocessing/dec.vec"
self.encoder_vocabulary = "./preprocessing/enc.vocab"
self.decoder_vocabulary = "./preprocessing/dec.vocab"
self.dictFile = './word_dict.txt'
self.batch_size = 1
self.max_batches = 10000
self.show_epoch = 100
self.model_path = './model/'
# load the jieba user dictionary
jieba.load_userdict(self.dictFile)
self.model = dynamicSeq2seq(encoder_cell=LSTMCell(20),
decoder_cell=LSTMCell(40),
encoder_vocab_size=540,
decoder_vocab_size=1600,
embedding_size=20,
attention=True,
bidirectional=True,
debug=False,
time_major=True)
self.location = ["??", "??", "??", "??","??"]
self.user_info = {"__username__":"Stephen", "__location__":"??"}
self.robot_info = {"__robotname__":"JiJi"}
self.dec_vocab = {}
self.enc_vocab = {}
tag_location = ''
with open(self.encoder_vocabulary, "r") as enc_vocab_file:
for index, word in enumerate(enc_vocab_file.readlines()):
self.enc_vocab[word.strip()] = index
with open(self.decoder_vocabulary, "r") as dec_vocab_file:
for index, word in enumerate(dec_vocab_file.readlines()):
self.dec_vocab[index] = word.strip()
def main(argv):
f = open('freeRiderData.txt')
jieba.load_userdict('KeywordDictionary.txt')
for line in f:
# segment the line
seg_list = jieba.cut(line, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))
return
def __init__(self):
self.__root_filepath = "f_dict/"
jieba.load_userdict("f_dict/user.dict")  # load the user dictionary
# load the sentiment dictionaries
self.__phrase_dict = self.__get_phrase_dict()
self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt")
self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt")
self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt")
self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt")
self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt")
self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt")
def __init():
user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
jieba.load_userdict(user_dict_path)
jieba.add_word(u"??", 10000)
jieba.suggest_freq((u"?", u"??"))
jieba.suggest_freq((u"??", u"??"))
jieba.suggest_freq((u"??", u"??"))
jieba.suggest_freq((u"??", u"?"))
def __init():
user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
jieba.load_userdict(user_dict_path)
jieba.add_word("??", 10000)
jieba.suggest_freq(("?", "??"))
jieba.suggest_freq(("??", "??"))
jieba.suggest_freq(("??", "??"))
jieba.suggest_freq(("??", "?"))
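The words passed to add_word() and suggest_freq() in the two snippets above were lost during extraction (shown as ??). The calls follow jieba's documented pattern; note that without a second argument tune=True, suggest_freq() only computes the suggested frequency and does not modify the dictionary. From jieba's own documentation:
jieba.suggest_freq(('中', '将'), True)   # force "中将" to be split into "中" / "将"
jieba.suggest_freq('台中', True)         # force "台中" to be kept as one word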
def __init__(self,userDict=None,conf={}):
self.userDict=userDict
self.conf={}
self.configFromDict(conf)
if self.userDict:
jieba.load_userdict(userDict)
self.configDefault()
def __init__(self, custom_dict_path=CUSTOM_DICTIONARY_PATH):
super(JiebaClient, self).__init__()
try:
jieba.load_userdict(custom_dict_path)
self.debug("init JiebaClient, with custom_dict_path=%s", custom_dict_path)
except Exception, e:
self.exception(e)
self.error('@@@@@@@@@@@@@@@@@@@@@@@@@@@ loading custom_dictionary failed')
def cutwords_jieba(self,sentence,userdict='dict/userdict.txt',stopwords='dict/stopwords.txt'):
stropw = []
if userdict:
jieba.load_userdict(userdict)
stropw = [line.strip() for line in open(stopwords,'r',encoding='utf-8').readlines()]
frequency = defaultdict(int)
l = list(jieba.cut(sentence))
for t in l:
frequency[t] += 1
texts = [token for token in frequency if frequency[token] > 0]
rtexts = list(set(texts)-set(stropw))
return rtexts
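Because the function returns list(set(texts) - set(stropw)), word order and duplicate counts are discarded; an order-preserving variant under the same file layout (a sketch):
import jieba
def cutwords_ordered(sentence, userdict='dict/userdict.txt', stopwords='dict/stopwords.txt'):
    jieba.load_userdict(userdict)
    stop = {line.strip() for line in open(stopwords, 'r', encoding='utf-8')}
    return [w for w in jieba.cut(sentence) if w not in stop]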
def read(self,file_name,POS_tag):
f = open(file_name, "r")
tempLine=[]
#vocabulary = {}
jieba.load_userdict("data/metadata/user_dict.txt")
for lineNo,line in enumerate(f.readlines()):
pattern=re.compile("^<d p=\"(.+)\">(.+)</d>")
m=pattern.match(line)
if m:
info=m.group(1).split(',')
temp={"time":int(float(info[0])), \
"text":[word for word,flag in pseg.cut(m.group(2)) \
if word not in self.stop_words and flag not in \
POS_tag ],
"lineno":lineNo+1,
"user":info[6]}
# drop single-character words; keep the comment only if at least 3 words remain
temp2=[]
for index,text in enumerate(temp["text"]):
if len(text)>1:
temp2.append(text)
if len(temp2)>=3:
print(temp2)
temp["text"]=temp2
tempLine.append(temp)
lines=sorted(tempLine, key= lambda e:(e.__getitem__('time')))
print(len(lines))
return lines#,vocabulary
def __init__(self):
self.ut_path = '../data/ut.data'
self.vocab_path = '../data/vocab.data'
self.ids_path = '../data/ids.data'
self.train_path = '../data/train.data'
self.dev_path = '../data/dev.data'
self.test_path = '../data/test.data'
self.dict_path = '../data/medical.txt'
self.emd_path = '../data/emd/ylemd.bin'
self.tag_path = '../data/tag.data'
jieba.load_userdict(self.dict_path)
def __init__(self):
self.ut_path = '../data/uterance.data'
self.mark_path = '../data/mark.data'
self.vocab_path = '../data/vocab.data'
self.ids_path = '../data/ids.data'
self.train_path = '../data/train.data'
self.dev_path = '../data/dev.data'
self.test_path = '../data/test.data'
self.dict_path = '../data/medical.txt'
self.emd_path = '../data/emd/ylemd.bin'
jieba.load_userdict(self.dict_path)
def __init__(self,size):
self.data_path = 'skin.data'
self.train_size = int(size*0.7)
self.dev_size = int(size*0.1)
self.test_size = size - self.train_size - self.dev_size
jieba.load_userdict('medical.txt')
self.sentences = []
self.orders = []
self.stop_line = []
for line in open('goodbye.data'):
line = line.strip()
self.stop_line.append(line)
self.ac_dialogs = []
def __init__(self):
jieba.load_userdict("keyword.txt")
jieba.load_userdict("mingan_word.txt")
self.topK = 12
self.mingan_list = []
self.get_mingan_list()
def __init__(self):
self.encoderFile = "./question.txt"
self.decoderFile = './answer.txt'
self.dictFile = 'word_dict.txt'
jieba.load_userdict(self.dictFile)
self.stopwordsFile = "./preprocessing/stopwords.dat"
def __init__(self):
print("tensorflow version: ", tf.__version__)
tf.reset_default_graph()
self.encoder_vec_file = "./preprocessing/enc.vec"
self.decoder_vec_file = "./preprocessing/dec.vec"
self.encoder_vocabulary = "./preprocessing/enc.vocab"
self.decoder_vocabulary = "./preprocessing/dec.vocab"
self.dictFile = './word_dict.txt'
self.batch_size = 1
self.max_batches = 100000
self.show_epoch = 100
self.model_path = './model/'
# load the jieba user dictionary
jieba.load_userdict(self.dictFile)
self.model = dynamicSeq2seq(encoder_cell=LSTMCell(40),
decoder_cell=LSTMCell(40),
encoder_vocab_size=600,
decoder_vocab_size=1600,
embedding_size=20,
attention=False,
bidirectional=False,
debug=False,
time_major=True)
self.location = ["??", "??", "??", "??"]
self.user_info = {"__username__":"yw", "__location__":"??"}
self.robot_info = {"__robotname__":"Rr"}
self.dec_vocab = {}
self.enc_vocab = {}
self.dec_vecToSeg = {}
tag_location = ''
with open(self.encoder_vocabulary, "r") as enc_vocab_file:
for index, word in enumerate(enc_vocab_file.readlines()):
self.enc_vocab[word.strip()] = index
with open(self.decoder_vocabulary, "r") as dec_vocab_file:
for index, word in enumerate(dec_vocab_file.readlines()):
self.dec_vecToSeg[index] = word.strip()
self.dec_vocab[word.strip()] = index
def cut_main():
jieba.set_dictionary('dict.txt.big')
#jieba.load_userdict("userdict.txt")
if len(sys.argv) == 3:
inputfile = sys.argv[1]
outputfile = sys.argv[2]
else:
print "Usage: python cut.py filetoCut.txt cuttedFile.txt"
sys.exit()
readNcut(inputfile,outputfile)
def cut_main(inputfile,outputfile):
jieba.set_dictionary('dict.txt.big')
#----- user-defined dict -----
#jieba.load_userdict("userdict.txt")
readNcut(inputfile,outputfile)
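Note the difference between the two calls: set_dictionary() swaps jieba's main dictionary (dict.txt.big is the larger dictionary commonly used for traditional Chinese), while load_userdict() only layers extra entries on top of whichever main dictionary is active. The two can be combined, as the commented-out line suggests:
jieba.set_dictionary('dict.txt.big')   # replace the main dictionary
jieba.load_userdict('userdict.txt')    # then add project-specific words on top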
def load_userdict():
"""
Load user dictionary
"""
# person-name dictionaries (entertainment, sports, politics)
jieba.load_userdict("./dict/name/amuse.txt")
jieba.load_userdict("./dict/name/sporter.txt")
jieba.load_userdict("./dict/name/politicians.txt")
# domain dictionary
jieba.load_userdict("./dict/sport.txt")  # sports terms
# general dictionary
jieba.load_userdict("./dict/dict.txt")
def load_userdict():
# person-name dictionaries (entertainment, sports, politics)
jieba.load_userdict("./dict/name/amuse.txt")
jieba.load_userdict("./dict/name/sporter.txt")
jieba.load_userdict("./dict/name/politicians.txt")
# domain dictionary
jieba.load_userdict("./dict/sport.txt")  # sports terms
# general dictionary
jieba.load_userdict("./dict/dict.txt")
def words_split(corpus_path):
with open(corpus_path, 'r') as f:
content = f.read()
jieba.load_userdict('data/userdict.txt')  # load the user-defined dictionary
jieba.enable_parallel(4)  # enable parallel segmentation
seg_list = jieba.cut(content, cut_all=False)  # accurate-mode segmentation
return seg_list
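enable_parallel() distributes segmentation of line-separated text across worker processes; per the jieba documentation it is not supported on Windows and only applies to the default tokenizers jieba.dt and jieba.posseg.dt, and jieba.disable_parallel() turns it off again. A guarded sketch, reusing content from the snippet above:
import os
if os.name == 'posix':                 # parallel mode is POSIX-only
    jieba.enable_parallel(4)
seg_list = list(jieba.cut(content, cut_all=False))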
# ?????
def __init__(self):
#self.encoderFile = "/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_ask.txt"
#self.decoderFile = '/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_answer.txt'
#self.savePath = '/home/yanwii/Python/NLP/seq2seq/seq2seq_pytorch/data/'
self.encoderFile = "./data/question.txt"
self.decoderFile = "./data/answer.txt"
self.savePath = './data/'
jieba.load_userdict("./data/supplementvocab.txt")
def __init__(self, diction=None, content=None):
self.diction = diction or "assets/location.dict"
self.content = content or ""
jieba.load_userdict(self.diction)
def __init__(self):
self.__root_filepath = "f_dict/"
jieba.load_userdict("f_dict/user.dict")  # load the user dictionary
# load the sentiment dictionaries
self.__phrase_dict = self.__get_phrase_dict()
self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt")
self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt")
self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt")
self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt")
self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt")
self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt")
def gen_dataset_from_baike():
doc_path = os.path.join(rel_ext_dir, 'sample_baike_doc.json')
out_path = os.path.join(rel_ext_dir, 'data/raw_dataset.txt')
name2fb_path = os.path.join(cache_dir, 'DatasetFinder.name2fb.cache')
fb_ttls_path = os.path.join(cache_dir, 'DatasetFinder.fb_ttls.cache')
finder = DatasetFinder.load_from_cache(name2fb_path, fb_ttls_path)
Print('load userdict')
jieba.load_userdict(os.path.join(rel_ext_dir, 'trimmed_baike_dict.txt'))
Print('gen dataset from [%s]' %doc_path)
outf = file(out_path, 'w')
for line in tqdm(file(doc_path), total = nb_lines_of(doc_path)):
p = line.split('\t')
baike_url = p[0].decode('utf-8')
paragraphs = json.loads(p[1])
for paragraph in paragraphs:
sentences = split_sentences(paragraph)
for sentence in sentences:
cases, words = gen_dataset(sentence, finder)
if len(cases) > 0:
out_obj = {
'words': "#".join(words),
'cases': map(str, cases),
}
outf.write("%s\t%s\n" %(baike_url, json.dumps(out_obj, ensure_ascii = False)))
outf.close()
def segment_text(text):
# load user dict
jieba.load_userdict(user_dict)
# set stop words
jieba.analyse.set_stop_words(stop_words)
tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
for tag in tags:
print(str(tag[0]) + "\t" + str(tag[1]))
def __init():
user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
jieba.load_userdict(user_dict_path)
jieba.add_word("??", 10000)
jieba.suggest_freq(("?", "??"))
jieba.suggest_freq(("??", "??"))
jieba.suggest_freq(("??", "??"))
jieba.suggest_freq(("??", "?"))
def __init__(self,n_core = 16):
self.rootdir = os.getcwd()
self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
self.n_CORE=n_core
jieba.enable_parallel(self.n_CORE-1)
def __init__(self):
self.__root_path = "data/dict/"
jieba.load_userdict("data/dict/user.dict")  # load the user dictionary
# sentiment dictionaries
self.__phrase_dict = self.__get_phrase_dict()
self.__positive_dict = self.__get_dict(self.__root_path + "positive_dict.txt")
self.__negative_dict = self.__get_dict(self.__root_path + "negative_dict.txt")
self.__conjunction_dict = self.__get_dict(self.__root_path + "conjunction_dict.txt")
self.__punctuation_dict = self.__get_dict(self.__root_path + "punctuation_dict.txt")
self.__adverb_dict = self.__get_dict(self.__root_path + "adverb_dict.txt")
self.__denial_dict = self.__get_dict(self.__root_path + "denial_dict.txt")
def read(self,file_name,timelength):
#f = open("data/1993410.txt", "r")
#timelength = 5640
# f = open("data/5077534.txt", "r")
# timelength = 4740
f = open(file_name, "r")
#timelength = 2582
tempLine=[]
#vocabulary=set()
vocabulary = {}
jieba.load_userdict("data/metadata/user_dict.txt")
for lineNo,line in enumerate(f.readlines()):
pattern=re.compile("^<d p=\"(.+)\">(.+)</d>")
m=pattern.match(line)
if m:
temp={}
temp={"time":int(float(m.group(1).split(',')[0])), \
"text":[word for word,flag in pseg.cut(m.group(2)) \
if word not in self.stop_words and flag not in \
["m","w","g","c","o","p","z","q","un","e","r","x","d","t","h","k","y","u","s","uj","ul","r","eng"] ],
"lineno":lineNo+1}
if len(temp["text"])>3:
tempLine.append(temp)
for item in temp["text"]:
if item not in vocabulary:
vocabulary[item]=0
#print(len(tempLine))
lines=sorted(tempLine, key= lambda e:(e.__getitem__('time')))
# print vocabulary
# print "vocabulary size: %d " % len(vocabulary)
# print "video comment size: %d " % len(lines)
# print lines[12]
self.store(lines,timelength)
return lines,timelength,vocabulary
def __init__(self, user_dict=None):
"""
Init WordSegment Client
@user_dict: user dict
If a user_dict path is given, it is loaded into jieba when the client is created.
"""
self.user_dict = user_dict
if self.user_dict is not None:
jieba.load_userdict(self.user_dict)
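load_userdict() modifies the shared default tokenizer (jieba.dt), so two clients created with different dictionaries would affect each other. If isolation matters, jieba also exposes a Tokenizer class with the same methods; a sketch with a hypothetical path and text:
tok = jieba.Tokenizer()              # independent of the global jieba.dt
tok.load_userdict("my_dict.txt")
words = list(tok.cut("待切分的文本"))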
def clean():
jieba.load_userdict("../data/segmention/unigram.txt")
output = open("./train.data", "w")
with open("../data/prepare_data", "r") as f:
for line in f:
line = unicode(line.strip())
# normalize to lowercase
line = line.lower()
# skip queries that are too short
if len(line) <= 2:
continue
# skip queries that begin with an 18-digit ID number
if re.match('[0-9]{18}', line) != None:
continue
# skip queries containing no Chinese characters
eng_flag = True
for i in line:
if i >= u'\u4e00' and i <= u'\u9fa5':
eng_flag = False
break
if eng_flag == True:
continue
# segment the query
ll = jieba.cut(line)
line = []
for i in ll:
if i == u"\u2006" or i == u" " or i == " ":
continue
line.append(i)
# normalize words using the synonym dictionary
for i in range(len(line)):
if synonym_dict.has_key(line[i]):
line[i] = synonym_dict[line[i]]
# skip queries that have already been seen
if line in s_list:
continue
l = ",".join(line)
s_list.append(line)
output.write(l + "\n")
output.close()
return
def __init__(self, itemInfos):
lastTime = time.time()
# itemInfos : dict[(pid, description)]
# train model
jieba.load_userdict('./dict.txt.big.txt')
stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
stopWords.add('\n')
stopWords.add(' ')
stopWords.add(u'\u2022')
stopWords.add(u'\xa9')
texts = []
self.name2id = {}
self.id2name = []
for k, v in itemInfos.iteritems():
seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
texts.append(list(seg_list))
self.name2id[k] = len(self.id2name)
self.id2name.append(k)
frequency = defaultdict(int)
for text in texts:
for token in text:
frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]
print "start cast :", (time.time() - lastTime)
lastTime = time.time()
dictionary = corpora.Dictionary(texts)
print "dictionary cast :", (time.time() - lastTime)
lastTime = time.time()
corpus = [dictionary.doc2bow(text) for text in texts]
print "doc2bow cast :", (time.time() - lastTime)
lastTime = time.time()
tfidf = models.TfidfModel(corpus)
print "tfid model cast :", (time.time() - lastTime)
lastTime = time.time()
lastTime = time.time()
corpus_tfidf = tfidf[corpus]
print "tfidf corpus cast :", (time.time() - lastTime)
lastTime = time.time()
self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
print "lsi model cast :", (time.time() - lastTime)
lastTime = time.time()
#corpus_lsi = lsi[corpus_tfidf]
self.index = similarities.MatrixSimilarity(self.lsi[corpus])
self.corpus = corpus
self.pidName = getPidName()
print "init finish"