"""Collect SkELL example sentences for a fixed list of words.

For each word the SkELL concordance page is rendered in headless Chrome,
the first six ``<td>`` cells of the rendered page are extracted, and their
text is appended to ``eps.txt`` under a per-word heading.
"""
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Location of the local chromedriver executable.
path = Service("D:\\MyDrivers\\chromedriver.exe")

# Configure Chrome to run without showing a browser window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Chrome only honours the user agent when it is passed as the
# ``--user-agent=<value>`` switch; the value must not carry stray quotes.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/97.0.4692.99 Safari/537.36'
)

# Compiled once because they are reused for every table cell.
_WHITESPACE = re.compile(r"\s+")
_LEADING_NUMBER = re.compile(r"^(\d+\.)")

lst = ["happy", "help", "evening", "great", "think", "adapt"]

# Create the Chrome instance.
driver = webdriver.Chrome(service=path, options=chrome_options)
try:
    # Open the output file once instead of once per word and per cell.
    with open("eps.txt", 'a+', encoding='utf-8') as f:
        for word in lst:
            url = ("https://skell.sketchengine.eu/#result?lang=en&query="
                   + word + "&f=concordance")
            driver.get(url)
            # Refresh the page so the data for the new word is loaded
            # (the query sits in the URL fragment; presumably get() alone
            # does not trigger a reload -- TODO confirm).
            driver.refresh()
            # Give the page time to render the results.
            time.sleep(2)

            # Parse the rendered page source.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            table = soup.find_all("td")

            f.write(f"\n{word}的例子\n")
            for cell in table[0:6]:
                # Collapse runs of whitespace into a single space.
                new = _WHITESPACE.sub(" ", cell.text)
                # Start a new line in front of a cell that begins with an
                # item number such as "1.".
                f.write(_LEADING_NUMBER.sub(r"\n\1", new))
finally:
    # quit() ends the whole session (browser and chromedriver process),
    # and the finally clause guarantees it even when scraping fails.
    driver.quit()
1. 为了加快访问速度,我们设置不显示浏览器,通过chrome.options实现
2. 最后通过re正则表达式来清理格式。
3. 我们设置table[0:6]来获取前三个句子的内容,最后显示结果如下。
1. This happy mood lasted roughly until last autumn.
2. The lodging was neither convenient nor happy .
3. One big happy family "fighting communism".
1. Applying hot moist towels may help relieve discomfort.
2. The intense light helps reproduce colors more effectively.
3. My survival route are self help books.
1. The evening feast costs another $10.
2. My evening hunt was pretty flat overall.
3. The area nightclubs were active during evenings .
1. The three countries represented here are three great democracies.
2. Our three different tour guides were great .
3. Your receptionist "crew" is great !
1. I said yes immediately without thinking everything through.
2. This book was shocking yet thought provoking.
3. He thought "disgusting" was more appropriate.
1. The novel has been adapted several times.
2. There are many ways plants can adapt .
3. They must adapt quickly to changing deadlines.
"""Collect SkELL example sentences for every word listed in wordlist.txt.

For each word the SkELL concordance page is rendered in headless Chrome,
the first six ``<td>`` cells of the rendered page are extracted, and their
text is appended to ``examples.txt`` under a per-word heading. When run as
a script the result file is opened afterwards.
"""
import os
import re
import time
from urllib.parse import quote

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Location of the local chromedriver executable.
path = Service("D:\\MyDrivers\\chromedriver.exe")

# Configure Chrome to run without showing a browser window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Chrome only honours the user agent when it is passed as the
# ``--user-agent=<value>`` switch; the value must not carry stray quotes.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/97.0.4692.99 Safari/537.36'
)

# Single output file: headings and sentences must land in the same place,
# and it is the file opened at the end of the run.
_OUTPUT_FILE = "examples.txt"

# Compiled once because they are reused for every table cell.
_WHITESPACE = re.compile(r"\s+")
_LEADING_NUMBER = re.compile(r"^(\d+\.)")


def get_wordlist():
    """Return the words stored one per line in ``wordlist.txt``.

    Surrounding whitespace is stripped and blank lines are skipped so that
    no empty query is sent to the site.

    Raises:
        FileNotFoundError: if ``wordlist.txt`` does not exist.
    """
    # utf-8-sig reads plain UTF-8 as well, and drops a BOM if present so it
    # does not end up glued to the first word.
    with open("wordlist.txt", 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word]


def _format_cell(text):
    """Collapse whitespace and put a cell starting with "N." on a new line."""
    collapsed = _WHITESPACE.sub(" ", text)
    return _LEADING_NUMBER.sub(r"\n\1", collapsed)


def main(lst):
    """Append example sentences for each word in ``lst`` to ``examples.txt``.

    Args:
        lst: iterable of words to look up.
    """
    driver = webdriver.Chrome(service=path, options=chrome_options)
    try:
        # Open the output file once instead of once per word and per cell.
        with open(_OUTPUT_FILE, 'a+', encoding='utf-8') as f:
            for word in lst:
                # Words come from a user-edited file, so escape characters
                # such as spaces, '&' or '#' that would corrupt the URL.
                url = ("https://skell.sketchengine.eu/#result?lang=en&query="
                       + quote(word) + "&f=concordance")
                driver.get(url)
                # Refresh the page so the data for the new word is loaded
                # (the query sits in the URL fragment; presumably get()
                # alone does not trigger a reload -- TODO confirm).
                driver.refresh()
                # Give the page time to render the results.
                time.sleep(2)

                # Parse the rendered page source.
                soup = BeautifulSoup(driver.page_source, "html.parser")
                table = soup.find_all("td")

                f.write(f"\n{word}的例子\n")
                for cell in table[0:6]:
                    f.write(_format_cell(cell.text))
    finally:
        # quit() ends the whole session (browser and chromedriver process),
        # and the finally clause guarantees it even when scraping fails.
        driver.quit()


if __name__ == "__main__":
    lst = get_wordlist()
    main(lst)
    # os.startfile exists on Windows only; it opens the result file with
    # the application associated with .txt files.
    os.startfile(_OUTPUT_FILE)