def DownloadParseDictionary():
    """Download and parse a bab.la-style online dictionary in four stages.

    Pipeline (all state flows through files on disk so the run is resumable):
      1. Download the top-level A-Z index page (``m_RUL_PRE``) into PATH_1_A_Z.
      2. For each letter URL found on it, download the letter's first page
         into PATH_2_A_Z (and mirror it into PATH_3_A_Z).
      3. From each letter page, read the pagination block to enumerate every
         page URL of that letter; dump the full URL list to a CSV for review.
      4. Download every paginated page into PATH_3_A_Z, then parse the word
         entries out of all of them and write the word list to ``m_RESULT``.

    Relies on module-level helpers/constants defined elsewhere in this file:
    getHtml, read, write, GetWord, NeedDownload2, NeedDownload3,
    Url3DownloadedCount, m_RUL_PRE, m_RESULT, PATH_1_A_Z/2/3.

    NOTE(review): this source arrived with all operators and quote characters
    stripped; assignments, concatenations, comparisons and string literals
    below are reconstructed from context — verify against the original repo.
    """
    # 1. *************************************** Download the first-level URL ***************************************
    print('检查是否下载。。。。。。url。。。。。。frist:' + m_RUL_PRE)
    filename = os.path.join(PATH_1_A_Z, '1.html')
    if os.path.isfile(filename):
        print('文件已存在-不需再次下载:' + filename)
    else:
        html = getHtml(m_RUL_PRE)
        # A near-empty body means the fetch failed; only cache real content.
        if len(str(html)) > 1:
            write(filename, html)
            print('babla_down_success(First):' + m_RUL_PRE)
        else:
            print('babla_down_fail(First):' + m_RUL_PRE)
    html = read(filename)
    soup = BeautifulSoup(html, 'lxml')
    # The A-Z letter navigation bar holds one link per initial letter.
    data = str(soup.select('.letter-nav')[0])
    print(data)
    # Pull every quoted http(s) URL out of the nav markup.
    # NOTE(review): pattern reconstructed — original likely '"(http[\s\S]*?)"'.
    urls = re.findall(re.compile(r'"(http[\s\S]*?)"'), data)
    m_urls = []
    for url in urls:
        # Skip the numeric "0-9" bucket; keep only letter pages.
        if '0-9' not in url:
            m_urls.append(url)
    print('总共字母数--[' + str(len(m_urls)) + ']')
    # Fewer than ~26 letters means the index page is broken/incomplete.
    if len(m_urls) < 5:
        print('第一个网页下载出问题------找lxz')
        sys.exit()
    # 2. *************************************** Download the second-level URLs ***************************************
    filename2s = []
    # NeedDownload2 decides (e.g. by counting files on disk) whether another
    # pass over the letter pages is still required.
    while NeedDownload2(len(m_urls)):
        filename2s = []
        for url in m_urls:
            _ = url.split('/')
            # Cache file name: "<letter-dir>_<page>.html" built from the URL tail.
            filename = os.path.join(PATH_2_A_Z, _[-2] + '_' + _[-1] + '.html')
            filename2s.append(filename)
            if os.path.isfile(filename):
                print('不需下载。。。。。。url。。。。。。second。。。。。。:' + url)
                continue
            print('下载。。。。。。url。。。。。。second。。。。。。:' + url)
            html = getHtml(url)
            # Real letter pages are large; short bodies are error pages.
            if len(str(html)) > 500:
                write(filename, html)
                # Page 1 of each letter doubles as a third-level page too.
                write(filename.replace(PATH_2_A_Z, PATH_3_A_Z), html)
                print('babla_down_success(Second):' + url)
            else:
                print('babla_down_fail(Second):' + url)
    # 3. *************************************** Parse out the third-level URLs ***************************************
    url3s = []
    for filename in filename2s:
        # Recover the letter from the cache file name ("a_1.html" -> "a").
        _char = filename.split(os.path.sep)[-1].split('_')[0]
        data = read(filename)
        soup = BeautifulSoup(data, 'lxml')
        print('解析出三级url。。。。。。' + filename)
        # .dict-pag is the pagination widget listing the letter's page links.
        data = str(soup.select('.dict-pag')[0])
        # NOTE(review): pattern reconstructed — original likely 'href="([\s\S]*?)"'.
        urls = re.findall(re.compile(r'href="([\s\S]*?)"'), data)
        if len(urls) == 0 or len(urls) == 1:
            # No pagination: the letter has a single page.
            url3s.append(m_RUL_PRE + _char + '/' + str(1))
        else:
            print(urls)
            # The last pagination link points at the highest page number.
            maxIndexData = urls[len(urls) - 1].split('/')
            maxIndex = maxIndexData[-1]
            for i in range(1, int(maxIndex) + 1):
                url3s.append(m_RUL_PRE + _char + '/' + str(i))
    # Dump the full URL list for manual inspection/debugging.
    urlss = ''
    for url in url3s:
        urlss += (url + '\n')
    write(os.path.join(PATH_2_A_Z, '1onlySee3url.csv'), urlss)
    # 4. *************************************** Download the third-level URLs ***************************************
    while NeedDownload3(len(url3s)):
        # Running count of pages already on disk, for the progress report.
        _url3HasCount = Url3DownloadedCount()
        for url in url3s:
            filename = url.split('/')[-2] + '_' + url.split('/')[-1] + '.html'
            filename = os.path.join(PATH_3_A_Z, filename)
            print(filename)
            if os.path.isfile(filename):
                print('无需下载。。。。。。url。。。。。。third。。。。。。:' + url)
                continue
            print('正在下载。。。。。。url。。。。。。third。。。。。。:' + url)
            html = getHtml(url)
            # Threshold lower than stage 2: later pages can be shorter.
            if len(str(html)) > 200:
                write(filename, html)
                _url3HasCount += 1
                print('babla_down_success(Third):' + url)
            else:
                print('babla_down_fail(Third):' + url)
            print('下载进度------------url------------3------------:[' + str(_url3HasCount) + '/' + str(len(url3s)) + ']')
    # 5. *************************************** Parse the words out of the third-level pages ***************************************
    all_words = []
    _parse_count = 0
    for url in url3s:
        _parse_count += 1
        filename = url.split('/')[-2] + '_' + url.split('/')[-1] + '.html'
        # Throttled progress report: every 20 pages and on the last one.
        if _parse_count % 20 == 0 or _parse_count == len(url3s):
            print('正在解析------------3级网页------------[' + filename + ']------[' + str(_parse_count) + '/' + str(len(url3s)) + ']')
        filename = os.path.join(PATH_3_A_Z, filename)
        soup = BeautifulSoup(read(filename), 'lxml')
        # Each column div holds a run of word anchors.
        aFileDivList = soup.select('.content-column .content .dict-select-wrapper .dict-select-column')
        for aDiv in aFileDivList:
            # Word text sits between a closing </span> and its anchor's </a>.
            # NOTE(review): pattern reconstructed — original likely '</span>([\s\S]*?)</a>'.
            aDivWords = re.findall(re.compile(r'</span>([\s\S]*?)</a>'), str(aDiv))
            for aWord in aDivWords:
                aWord = GetWord(aWord)
                # NOTE(review): the filtered character was lost in the
                # garbled source; a space (drop multi-word phrases) is the
                # most plausible reading — confirm against the original.
                if ' ' not in aWord:
                    all_words.append(aWord)
    # Write one word per line to the final result file.
    dic_words = ''
    for aWord in all_words:
        dic_words += (aWord + '\n')
    write(m_RESULT, dic_words)