From e46890042c7549c33e8722a5242c450c2d02eab8 Mon Sep 17 00:00:00 2001 From: red Date: Wed, 27 Feb 2019 15:21:49 +0800 Subject: [PATCH] 1. Open files with encoding='UTF-8' so that non-English environments do not raise decoding errors. 2. logging.info("xxx:", xxx) was missing the %s placeholder; it should be logging.info("xxx:%s", xxx). --- .gitignore | 3 +++ data_load.py | 4 ++-- prepro.py | 30 +++++++++++++++--------------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 55bc9f0..23bc10c 100644 --- a/.gitignore +++ b/.gitignore @@ -101,3 +101,6 @@ ENV/ corpora logdir preprocessed +.idea +iwslt2016 +log diff --git a/data_load.py b/data_load.py index ba31839..f017f38 100644 --- a/data_load.py +++ b/data_load.py @@ -21,7 +21,7 @@ def load_vocab(vocab_fpath): Returns two dictionaries. ''' - vocab = [line.split()[0] for line in open(vocab_fpath, 'r').read().splitlines()] + vocab = [line.split()[0] for line in open(vocab_fpath, 'r',encoding='UTF-8').read().splitlines()] token2idx = {token: idx for idx, token in enumerate(vocab)} idx2token = {idx: token for idx, token in enumerate(vocab)} return token2idx, idx2token @@ -38,7 +38,7 @@ def load_data(fpath1, fpath2, maxlen1, maxlen2): sents2: list of target sents ''' sents1, sents2 = [], [] - with open(fpath1, 'r') as f1, open(fpath2, 'r') as f2: + with open(fpath1, 'r',encoding='UTF-8') as f1, open(fpath2, 'r',encoding='UTF-8') as f2: for sent1, sent2 in zip(f1, f2): if len(sent1.split()) + 1 > maxlen1: continue # 1: if len(sent2.split()) + 1 > maxlen2: continue # 1: diff --git a/prepro.py b/prepro.py index c08c7c9..5bb3610 100644 --- a/prepro.py +++ b/prepro.py @@ -34,14 +34,14 @@ def prepro(hp): logging.info("# Preprocessing") # train - _prepro = lambda x: [line.strip() for line in open(x, 'r').read().split("\n") \ + _prepro = lambda x: [line.strip() for line in open(x, 'r',encoding='UTF-8').read().split("\n") \ if not line.startswith("<")] prepro_train1, prepro_train2 = _prepro(train1), 
_prepro(train2) assert len(prepro_train1)==len(prepro_train2), "Check if train source and target files match." # eval _prepro = lambda x: [re.sub("<[^>]+>", "", line).strip() \ - for line in open(x, 'r').read().split("\n") \ + for line in open(x, 'r',encoding='UTF-8').read().split("\n") \ if line.startswith("