import datetime

# Build the list of dates covering 2019-11-20 (the day the script was run)
# and the 365 days before it: 366 date strings in total, oldest first
days = []
today = datetime.date.today()
for i in range(366):
    day = today - datetime.timedelta(days=i)
    days.append(day.strftime('%Y-%m-%d'))
days = list(reversed(days))
print(days)
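The comment-scraping snippet below assumes `resp` already holds the parsed JSON of a Weibo comment-page request; the request itself is not shown in this excerpt. A minimal sketch of the assumed setup (the URL and headers are placeholders, not the original values):

import requests

# Placeholders: the real endpoint, query parameters, and cookies are not
# shown in the original excerpt.
url = 'https://example.com/comment_api'
headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get(url, headers=headers).json()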
from lxml import etree

if resp['code'] == '100000':  # code '100000' indicates a successful response
    html = resp['data']['html']
    html = etree.HTML(html)
    data = html.xpath('//div[@node-type="comment_list"]')
    for node in data:
        # Commenter nicknames
        nick_name = node.xpath('.//div[@class="WB_text"]/a[1]/text()')
        # Comment bodies (string(.) flattens each div to plain text)
        text = node.xpath('.//div[@class="WB_text"]')
        text = [t.xpath('string(.)') for t in text]
        # Avatar image URLs
        pic_url = node.xpath('.//div[@class="WB_face W_fl"]/a/img/@src')
        print(len(nick_name), len(text), len(pic_url))
        write_comment([t.strip() for t in text], pic_url, nick_name)
The helper functions that write the comments to a file and download the avatar images are as follows:
import os
import requests

# Download a commenter's avatar image
def download_pic(url, nick_name):
    if not url:
        return
    if not os.path.exists(pic_file_path):
        os.mkdir(pic_file_path)
    resp = requests.get(url)
    if resp.status_code == 200:
        with open(pic_file_path + f'/{nick_name}.jpg', 'wb') as f:
            f.write(resp.content)

# Append the comment text to a file
def write_comment(comment, pic_url, nick_name):
    with open('comment.txt', 'a', encoding='utf-8') as f:
        for index, i in enumerate(comment):
            # Drop replies to other comments ('回复' means "reply") and empty lines
            if ':' not in i and '回复' not in i and i != '':
                w_comment = i.strip().replace(':', '').replace('\n', '')
                # Strip boilerplate ('等人' = "and others", '图片评论' = "image comment"), then write
                f.write(w_comment.replace('等人', '').replace('图片评论', '') + '\n')
                # Fetch the matching avatar
                download_pic(pic_url[index], nick_name[index])
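Note that `pic_file_path` is used as a module-level global in `download_pic` but is never defined in this excerpt; define it before calling the function (the path below is an assumption):

pic_file_path = './comment_pics'  # assumed save directory, not from the original post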
from get_index import BaiduIndex

if __name__ == "__main__":
    keywords = ['区块链']
    results = {'区块链': []}
    baidu_index = BaiduIndex(keywords, '2019-05-04', '2019-11-04')
    for index in baidu_index.get_index():
        if index['type'] == 'all':
            results[index['keyword']].append(index['index'])
    print(results)
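The shape of the yielded records is not documented in this excerpt, but judging from the loop above and the int() casts in the plotting code further down, get_index() appears to yield one dict per keyword per day, with the index value as a string:

# Inferred record shape (an assumption based on how it is consumed):
# {'keyword': '区块链', 'type': 'all', 'index': '1234'}
# so results ends up as {'区块链': ['1234', '1180', ...]}, one string per day.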
import numpy as np
import pandas as pd

def pettitt(data):
    data = np.array(data)
    n = data.shape[0]
    k = range(n)
    dataT = pd.Series(data)
    r = dataT.rank()
    # U_k statistic for every candidate split point k
    Uk = [2 * np.sum(r[0:x]) - x * (n + 1) for x in k]
    Uabs = list(np.abs(Uk))
    U = np.max(Uabs)
    K = Uabs.index(U)
    # Approximate two-sided significance probability
    p = 2 * np.exp((-6 * U**2) / (n**3 + n**2))
    if p <= 0.05:
        # Significant change point at the conventional 5% level
        result = 'yes'
    else:
        # Not significant
        result = 'no'
    return K, result
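For reference, this is the standard Pettitt change-point test written in rank form: with $r_i$ the rank of observation $i$ in a window of length $n$,

$$U_k = 2\sum_{i=1}^{k} r_i - k(n+1), \qquad K = \arg\max_k \lvert U_k \rvert, \qquad p \approx 2\exp\!\left(\frac{-6\,U_K^2}{n^3 + n^2}\right),$$

so $K$ is the most likely split point and a small $p$ means the distributions before and after $K$ differ significantly.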
# The Baidu index values arrive as strings, so cast them once before testing
series = [int(v) for v in results['区块链']]
length = len(series)
locations = []
# Slide a 29-day window across the series and run the Pettitt test on each window
for i in range(0, length, 1):
    pos, result = pettitt(series[i:i+29])
    if result == 'yes':
        # pos is relative to the window start, so shift it back to the full series
        locations.append(pos + i)
print(set(locations))
Printed as a bare set of positions, the result is hard to interpret, so let's visualize it with matplotlib:
import matplotlib.pyplot as plt

print(results)
plt.plot(range(len(results['区块链'])), [int(i) for i in results['区块链']])
# Mark each detected change point with a black square
for i in locations:
    plt.plot(i, int(results['区块链'][i]), 'ks')
# y-axis from 0 to 250,000 with a tick every 50,000
my_y_ticks = np.arange(0, 250000, 50000)
plt.yticks(my_y_ticks)
plt.show()
import os
import jieba

def jieba_cut_and_save_file(inputList, output_cleaned_file=False):
    """
    1. Read Chinese text and segment it into words
    2. Optionally save the segmented result to a file
    3. If a segmented data file already exists, load it directly
    """
    output_file = os.path.join('./data/', 'cleaned_' + 'trainMatrix.txt')
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as f:
            lines = [line.strip('\n').split(' ') for line in f.readlines()]
    else:
        # Clean each sentence with clean_str, then segment it with jieba
        lines = [list(jieba.cut(clean_str(line))) for line in inputList]
        lines = [[word for word in line if word != ' '] for line in lines]
        if output_cleaned_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                for line in lines:
                    f.write(" ".join(line) + '\n')
    vocabulary = createVocabList(lines)
    # Build a vectorizer from the vocabulary and vectorize every line
    setOfWords2Vec = setOfWords2VecFactory(vocabulary)
    vectorized = [setOfWords2Vec(news) for news in lines]
    return vectorized, vocabulary
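createVocabList is not included in this excerpt; a minimal version consistent with how it is called here (building a deduplicated word list from the segmented lines) might look like:

def createVocabList(dataSet):
    # Union of all words across all segmented documents (a sketch;
    # the original implementation is not shown in this excerpt)
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)
    return list(vocabSet)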
import re

def clean_str(string):
    """
    1. Replace every non-Chinese character with a space
    2. Strip leading and trailing whitespace
    """
    string = re.sub(r'[^\u4e00-\u9fff]', ' ', string)
    string = re.sub(r'\s{2,}', ' ', string)
    return string.strip()
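For example, everything outside the CJK range \u4e00-\u9fff (digits, Latin letters, punctuation) collapses to single spaces:

print(clean_str('2019年11月20日, Bitcoin大涨!'))  # -> 年 月 日 大涨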
def setOfWords2VecFactory(vocabList):
    """
    Build a setOfWords2Vec function bound to the given vocabulary
    """
    # Optimization: precompute a word-to-index map so each lookup is O(1)
    index_map = {}
    for i, word in enumerate(vocabList):
        index_map[word] = i

    def setOfWords2Vec(news):
        """
        Vectorize one news item against the vocabulary fixed at construction time
        """
        result = [0] * len(vocabList)
        for word in news:
            # One dict lookup both fetches the index and tests membership
            index = index_map.get(word, None)
            if index is not None:  # explicit None check so index 0 is not skipped
                result[index] = 1
        return result
    return setOfWords2Vec
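A quick usage sketch (the vocabulary and words are made-up examples):

vec = setOfWords2VecFactory(['区块链', '大涨', '比特币'])
print(vec(['区块链', '大涨']))    # -> [1, 1, 0]
print(vec(['区块链', '未知词']))  # -> [1, 0, 0]; out-of-vocabulary words are ignored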
5. Finally, the vectorization of news items used at test time.
The principle is the same as the training-set vectorization above, but because it operates on individual news items at test time, we keep it as a separate function.
def vectorize_newslist(news_list, vocabulary):
    """
    Vectorize a list of news items into a word-vector matrix
    Note: vocabulary should be the one built from the training set
    """
    # Segment and filter
    cut_news_list = [list(jieba.cut(clean_str(news))) for news in news_list]
    # Build a vectorizer from the vocabulary and vectorize each news item
    setOfWords2Vec = setOfWords2VecFactory(vocabulary)
    vectorized = [setOfWords2Vec(news) for news in cut_news_list]
    return vectorized, vocabulary
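A usage sketch, assuming the vocabulary came from running jieba_cut_and_save_file on the training texts (train_texts and the sample headline are assumptions, not from the original post):

_, vocabulary = jieba_cut_and_save_file(train_texts)
vectors, _ = vectorize_newslist(['区块链概念股大涨'], vocabulary)
print(vectors[0])  # one 0/1 vector of length len(vocabulary)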