标签 朗文词典 下的文章

import re
import time
# Clean the dictionary dump `..\data\朗文双解.csv`: for every input line, pull
# out the headword part and the marked-up definition fragments, turn a few
# known tags into "|" separators, strip all remaining tags, and append the
# result as one row to a new output file named with the current epoch time.

# Everything from the start of the line up to (and including) the last tab.
headword_re = re.compile(r'^.*\t')
# Text between each `<font class=L_SYL>` and the next `</span>`.
entry_re = re.compile(r'<font class=L_SYL>(.*?)</span>')
# Tags that become a "|" separator. The three patterns cannot overlap, so a
# single alternation gives the same result as three sequential substitutions.
separator_tag_re = re.compile(r'<font color=black>|<span class=L_POS>|</font>')
# Any remaining tag is dropped.
any_tag_re = re.compile(r'<.*?>')

# Both files are managed by `with`, so the output is flushed and closed even
# if processing fails part-way (the original never closed the output handle).
with open(r'..\data\朗文双解清洗'+str(time.time())+'.csv', 'a', encoding='utf-8') as file1, \
        open(r'..\data\朗文双解.csv', encoding='utf-8') as file:
    # Stream the input line by line instead of loading it all with readlines().
    for lin in file:
        word = headword_re.findall(lin)
        en = entry_re.findall(lin)
        # NOTE: `word` and `en` are lists returned by findall; the row is built
        # from their str() form (e.g. "['abc\\t']"), which is kept as-is so the
        # output format stays identical to earlier runs.
        with_separators = separator_tag_re.sub("|", str(en))
        cleaned = any_tag_re.sub("", with_separators)
        file1.write(str(word) + '\t' + cleaned + '\n')
        print(word, cleaned)

import re
import time
# Extract the starred lines and the numbered items from
# `..\data\朗文双解.txt` and append them to a new output file whose name
# carries the local timestamp (YYYYmmddHHMMSS).
newtime = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))

# A single digit 1-9 followed by a dot, plus the rest of the line.
# The original class `[123459789]` listed 9 twice and omitted 6, so items
# numbered "6." were silently skipped.
numbered_re = re.compile(r'[1-9]\..*')
# From the first "★" to the end of the line.
starred_re = re.compile(r'★.*')

# Both files are managed by `with`, so the output is flushed and closed even
# if processing fails part-way (the original never closed the output handle).
with open(r'..\data\朗文双解清洗'+str(newtime)+'.csv', 'a', encoding='utf-8') as file1, \
        open(r'..\data\朗文双解.txt', encoding='utf-8') as file:
    # Stream the input line by line instead of loading it all with readlines().
    for lin in file:
        word = numbered_re.findall(lin)
        en = starred_re.findall(lin)
        print(en, word)
        # NOTE: both values are findall lists; their str() form is written for
        # every input line (including "[]" when nothing matched), as before.
        file1.write(str(en) + '\n' + str(word) + '\n')