我竟然用Python对seo进行优化?

我竟然用Python对seo进行优化?

PluginsKers
2021-01-05 / 1 评论 / 376 阅读 / 正在检测是否收录...

0x01前言

这个可能是站长们挺关心的一个话题...
SEO如何更高效,以及权重提升的方案。
这里使用Python爬虫套代理,对目标关键词的搜索结果进行多IP访问。

0x02代码

不浪费萌新时间,代码如下,大佬请继续往下看。

#!/usr/bin/python
# -*- coding: UTF-8 -*-

from time import sleep
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

breathe_time = 10  # seconds to sleep between proxy-list pages ("breathing time")
target = 'https://www.baidu.com/?wd=PluginsKers'  # URL requested through each proxy
page = 1  # first page of the free-proxy listing to scrape

# NOTE: this rebinds the imported UserAgent *class* to an instance of itself;
# kept as-is because the rest of the script reads `UserAgent.random`.
UserAgent = UserAgent()
proxy_pool = []  # unused in the visible code; kept for compatibility

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--disable-gpu')
options.add_argument('--disable-cache')
options.headless = True
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# FIX: `chrome_options=` is deprecated (and removed in Selenium 4);
# the `options=` keyword is the supported spelling.
browser = webdriver.Chrome(options=options)


def on_proxy():
    """Scrape free proxies from kuaidaili and hit `target` once through each.

    Runs forever: scrapes one listing page, issues one GET per proxy with a
    random User-Agent, sleeps `breathe_time` seconds, then moves on to the
    next page.  Never returns; stop with Ctrl-C.

    FIX: the original version recursed into itself once per page, which
    exhausts the interpreter's recursion limit after ~1000 pages — rewritten
    as a loop.  It also re-queried *all* IP/PORT cells for every row (O(n^2));
    we now fetch each column once per page and zip them.
    """
    global page
    while True:
        browser.get('https://www.kuaidaili.com/free/inha/' + str(page) + '/')
        ip_cells = browser.find_elements_by_css_selector(
            'tr>td[data-title="IP"]')
        port_cells = browser.find_elements_by_css_selector(
            'tr>td[data-title="PORT"]')

        for ip_cell, port_cell in zip(ip_cells, port_cells):
            proxy = ip_cell.text + ':' + port_cell.text
            try:
                UA = UserAgent.random
                headers = {'User-Agent': UA}
                proxies = {'http': 'http://' + proxy}

                print('当前代理:'+proxy+'\n当前UA:'+UA)
                # Short timeout: a dead proxy must not stall the loop.
                requests.get(target, headers=headers, proxies=proxies, timeout=2)
            except IOError:
                # requests' exceptions subclass IOError; skip bad proxies.
                continue

        print("呼吸周期")
        sleep(breathe_time)  # back off so the proxy site doesn't ban us
        page = page + 1


# Entry point: start the scrape-and-request loop (runs until interrupted).
on_proxy()

预览

划分的很简单,抓取代理为一部分。另外一部分则是使用Selenium进行模拟访问。
如果你知道CC,那么这个就很好理解,它就像是一个低频CC。
呼吸周期则是避免代理站屏蔽你的高频IP,这个很好理解。
很简单的一个小工具。

0x03包

你需要的包有:

  • BeautifulSoup
  • fake_useragent
  • requests
  • selenium
pip install bs4
pip install fake_useragent
pip install requests
pip install selenium
python xxx.py

啊啊啊啊啊啊!!!!百度是不是把我忘了啊! 表情

大狗熊~~

这个版本是2021.5.20折腾出来的,我也想有对象呀,我也想要甜甜的恋爱,可是我当时就是在折腾这些

version

不解释了,直接丢代码:

# -*- coding: utf-8 -*-
import time
import re
import random
import requests
from time import sleep
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import urllib.request

sitemap = "https://www.52craft.cc/sitemap.xml"
# Fetch the sitemap once at startup.  FIX: the original urlopen() had no
# timeout (could hang forever) and never closed the response — use a
# context manager with an explicit timeout.
with urllib.request.urlopen(sitemap, timeout=30) as resp:
    html = resp.read().decode('utf-8')
# FIX: non-greedy `.*?` — the greedy form would fuse several <loc> entries
# appearing on one physical line into a single bogus URL.
xmlre = re.findall(re.compile(r'<loc>(.*?)</loc>'), html)
for i in xmlre:
    print(i)

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-errors")
options.add_argument("disable-infobars")
# options.add_argument("--incognito")
options.add_argument("--no-sandbox")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--disable-cache")
options.add_argument("log-level=3")
# BUG FIX: add_experimental_option() stores by key, so a second call with
# "excludeSwitches" silently *replaced* the first, dropping
# "enable-automation".  Pass both switches in one call.
options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"])

count = 0        # result pages visited in the current browser session
timeout = 20     # seconds WebDriverWait polls for page elements
driver = None    # current WebDriver; created in __init__()
service = None   # chromedriver Service; created in __init__()
# NOTE: shadows the builtin max(); kept because select() binds it via
# `global count, max`.  Page limit per browser session.
max = 50


def Unicode():
    """Return one random character from the CJK range U+4E00..U+9FBF."""
    return chr(random.randint(0x4E00, 0x9FBF))


def GBK2312():
    """Return one random two-byte GB2312 character as a str.

    FIX: the original shadowed the builtin `str`, round-tripped through a
    hex string, and could raise UnicodeDecodeError because the sampled
    byte ranges (lead 0xB0-0xF7, trail 0xA1-0xFE) include a few unassigned
    GB2312 cells.  We now build the bytes directly and resample on an
    invalid combination, so the function always returns a character.
    """
    while True:
        head = random.randint(0xB0, 0xF7)
        body = random.randint(0xA1, 0xFE)
        try:
            return bytes((head, body)).decode("gb2312")
        except UnicodeDecodeError:
            continue  # landed on an unassigned cell; draw again


def rand_str(num):
    """Return a random alphanumeric string of length `num`."""
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    return "".join(random.choice(alphabet) for _ in range(num))


def rand_unicode(num):
    """Return a string of `num` random CJK characters (via Unicode())."""
    return "".join(Unicode() for _ in range(num))


def rand_keyword():
    """Pick one of the preset search keywords at random."""
    keywords = (
        "kers的 网易云热评墙",
        "kers的 motdplatform",
        "KERS博客",
        "Plugins博客",
        "pluginkers",
        "kersblog",
    )
    return random.choice(keywords)


def get_proxy():
    """Fetch one proxy from the local proxy-pool service.

    Returns the decoded JSON dict; callers read its "proxy" key
    ("ip:port").  FIX: added a timeout so a dead pool service cannot
    hang the whole script forever.
    """
    return requests.get("http://127.0.0.1:5010/get/", timeout=5).json()


def delete_proxy(proxy):
    """Tell the local proxy-pool service to discard a dead proxy."""
    print(
        "\033[34m[{}][CONSOLE]\033[0m\033[31m[INFO] 删除代理 {}\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), proxy))
    # FIX: timeout so an unresponsive pool service cannot block the crawl.
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy), timeout=5)


def retry():
    """Tear down the current browser session and restart with a new keyword.

    FIX: `driver`/`service` are still None when the very first startup
    fails, so the unconditional `driver.quit()` raised AttributeError —
    both are now guarded, and cleanup is best-effort since a crashed
    chromedriver may reject quit()/stop().

    NOTE(review): retry() and __init__() call each other, so the call
    stack grows with every session — confirm long runs don't exhaust it.
    """
    global driver, service
    if driver is not None:
        try:
            driver.quit()
        except Exception:
            pass  # best-effort: the session may already be dead
    if service is not None:
        try:
            service.stop()
        except Exception:
            pass  # best-effort: chromedriver may already be gone
    key = rand_keyword()
    __init__(key)


def select():
    """Scan the current Baidu result page for 52craft.cc links, click them,
    then advance to the next result page.

    Recurses once per result page until `count` reaches `max`, then calls
    retry() to restart the session with a fresh proxy and keyword.
    """
    global count, max
    '''
    列表页面
    '''
    # Always work in the original tab; clicked results may open new tabs.
    driver.switch_to.window(driver.window_handles[0])
    count += 1
    if count < max:
        # Wait until the result list has rendered (.c-showurl is one
        # element per organic result).
        WebDriverWait(driver, timeout, poll_frequency=1).until(
            EC.visibility_of_element_located(
                (
                    By.CSS_SELECTOR,
                    '.c-showurl',
                )
            )
        )
        print(
            "\033[34m[{}][CONSOLE]\033[0m\033[36m[INFO] 列表[{}]加载完毕。\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), count))

        # Click every organic result whose text mentions the target domain.
        for a in driver.find_elements_by_css_selector("div.result.c-container"):
            if re.search(r"52craft.cc", a.text):
                if a.find_element_by_css_selector("a[data-click]"):
                    a.find_element_by_css_selector("a[data-click]").click()
                    print(
                        "\033[34m[{}][CONSOLE]\033[0m\033[32m[INFO] 找到结果,并且执行请求。\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                    # Return focus to the results tab after the click.
                    driver.switch_to.window(driver.window_handles[0])

        time.sleep(4)

        # Navigate to the "next page" link and process it recursively.
        # NOTE(review): raises NoSuchElementException on the last result
        # page (no a.n link) — presumably caught by the caller's except.
        driver.execute_script("window.location.href = '{}';".format(
            driver.find_element_by_css_selector("a.n:last-child").get_attribute("href")))
        select()

    else:
        # Page limit reached for this session: start over with a new
        # proxy/keyword.
        print(
            "\033[34m[{}][CONSOLE]\033[0m\033[36m[INFO] 翻页达到上限,正在初始化。\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        retry()


def __init__(keyword):
    """Start a headless-Chrome session behind a fresh proxy, search Baidu
    for `keyword`, and hand the result pages to select().

    NOTE(review): despite the name, this is a module-level function, not a
    method — consider renaming (e.g. run_session) in a later pass.
    """
    global options, count, driver, service, timeout
    print(
        "\033[34m[{}][CONSOLE]\033[0m\033[36m[INFO] 使用关键词 {}\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), keyword))
    try:
        # Hard-coded local chromedriver path — machine-specific.
        service = Service(r"D:/Program Files (x86)/Python/chromedriver.exe")
        service.command_line_args()
        service.start()
        proxy = get_proxy().get("proxy")
        url = "https://www.baidu.com/"
        # NOTE(review): `options` is a module-level object mutated on every
        # retry, so user-agent/proxy-server arguments accumulate across
        # sessions — confirm Chrome honors only the last occurrence.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 1})
        options.add_argument("user-agent={}".format(UserAgent().random))
        options.add_argument("--proxy-server=http://{}".format(proxy))
        # pageLoadStrategy "none": get() returns immediately; readiness is
        # handled by the explicit WebDriverWait below.
        capabilities = DesiredCapabilities.CHROME
        capabilities["pageLoadStrategy"] = "none"
        driver = webdriver.Remote(
            service.service_url, desired_capabilities=capabilities, options=options)
        print("\033[34m[{}][CONSOLE]\033[0m\033[36m[INFO] 浏览器初始化完毕 {}\033[0m".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), proxy))
    except Exception:
        # NOTE(review): after retry() returns, execution falls through to
        # the next try-block with `url`/`driver` possibly unusable.
        print(
            "\033[34m[{}][CONSOLE]\033[0m\033[31m[WARN] 浏览器创建时出现问题。\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        retry()

    try:
        driver.get(url)
        '''
        搜索页面
        '''
        # Wait for the Baidu search box before typing the keyword.
        WebDriverWait(driver, timeout, poll_frequency=1).until(
            EC.presence_of_element_located(
                (
                    By.CSS_SELECTOR,
                    'input[name="wd"]',
                )
            )
        )
        # driver.execute_script("window.stop();")

        driver.find_element_by_css_selector(
            'input[name="wd"]').send_keys(keyword)
        driver.find_element_by_css_selector(
            '.s_form_wrapper input.bg.s_btn[type="submit"]').click()

        print("\033[34m[{}][CONSOLE]\033[0m\033[36m[INFO] 页面元素加载完毕。\033[0m".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

    except Exception:
        # NOTE(review): `proxy` is unbound here if the first try failed
        # before its assignment — this would raise NameError.
        print(
            "\033[34m[{}][CONSOLE]\033[0m\033[31m[WARN] 网页请求时出现问题。\033[0m\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        delete_proxy(proxy)
        retry()

    try:
        # Fresh session: reset the page counter before walking results.
        count = 0
        select()

    except Exception as error:
        print(
            "\033[34m[{}][CONSOLE]\033[0m\033[31m[WARN] 页面处理时出现问题:\033[0m {}\033[0m".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), error))

    # Unconditional restart: the script loops forever via retry().
    retry()


# Entry point: start the first session with a random keyword.
__init__(rand_keyword())
70

评论 (1)

取消
  1. 头像
    派蒙不是应急食物
    Windows 10 · Google Chrome

    我刚开始还以为。。他没什么用,卧槽。用来做词尾优化真的牛啊,感谢博主!!!!!!!!就挂了一晚上,先不说引索量,就是收录都起飞了。表情表情

    回复