抓取上传文件_面试问答

我花了整整一天的时间试图弄清楚如何实现这一点。最终，我找到了一个2016年提交、但至今仍未被合并的Scrapy拉取请求（pull request），其中实现了多部分（multipart/form-data）表单请求：

import random
import string

import lxml.html
import six
from parsel.selector import create_root_node
from scrapy import FormRequest
from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
from six.moves.urllib.parse import urljoin, urlencode


class MultipartFormRequest(FormRequest):
    """A ``FormRequest`` whose body is encoded as ``multipart/form-data``.

    ``formdata`` may be a dict or an iterable of ``(name, value)`` pairs.
    A value that is a ``MultipartFile`` is sent as a file part; any other
    value is sent as a plain form field.
    """

    def __init__(self, *args, **kwargs):
        # Take ``formdata`` out of kwargs before delegating so the parent
        # constructor never sees it; the body is built below instead.
        formdata = kwargs.pop('formdata', None)
        kwargs.setdefault('method', 'POST')
        super(MultipartFormRequest, self).__init__(*args, **kwargs)

        content_type = self.headers.setdefault(
            b'Content-Type', [b'multipart/form-data'])[0]
        method = kwargs.get('method').upper()

        # Only build a multipart body for POST requests whose Content-Type
        # is exactly the bare multipart value (i.e. no boundary chosen yet).
        if formdata and method == 'POST' and content_type == b'multipart/form-data':
            items = formdata.items() if isinstance(formdata, dict) else formdata
            # Random 20-character alphanumeric boundary separating the parts.
            alphabet = string.digits + string.ascii_letters
            self._boundary = to_bytes(
                ''.join(random.choice(alphabet) for _ in range(20)),
                self.encoding)
            self.headers[b'Content-Type'] = (
                b'multipart/form-data; boundary=' + self._boundary)
            request_data = _multpart_encode(items, self._boundary, self.encoding)
            self._set_body(request_data)


class MultipartFile(object):
    """Value object describing one file part of a multipart form.

    name: file name sent in the ``filename`` parameter of the part.
    content: the file payload (passed through ``to_bytes`` when encoded).
    mimetype: value of the part's ``Content-Type`` header.
    """

    def __init__(self, name, content, mimetype='application/octet-stream'):
        self.name = name
        self.content = content
        self.mimetype = mimetype


def _get_form_url(form, url):
    """Resolve the submit URL against the form's base URL.

    Falls back to the form's ``action`` when ``url`` is None.
    """
    if url is None:
        return urljoin(form.base_url, form.action)
    return urljoin(form.base_url, url)


def _urlencode(seq, enc):
    """URL-encode ``(key, value-or-values)`` pairs as bytes in ``enc``.

    List-like values are expanded into one pair per element.
    """
    values = [(to_bytes(k, enc), to_bytes(v, enc))
              for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    return urlencode(values, doseq=1)


def _multpart_encode(items, boundary, enc):
    """Serialize ``(name, value)`` pairs into a multipart/form-data body.

    ``boundary`` is the bytes boundary (without the leading ``--``) and
    ``enc`` the encoding used for any text converted to bytes. Returns the
    complete body as bytes, terminated by the closing boundary line.
    """
    body = []
    for name, value in items:
        body.append(b'--' + boundary)
        if isinstance(value, MultipartFile):
            body.append(
                b'Content-Disposition: form-data; name="' + to_bytes(name, enc) +
                b'"; filename="' + to_bytes(value.name, enc) + b'"')
            body.append(b'Content-Type: ' + to_bytes(value.mimetype, enc))
            body.append(b'')  # blank line separates part headers from payload
            body.append(to_bytes(value.content, enc))
        else:
            body.append(
                b'Content-Disposition: form-data; name="' + to_bytes(name, enc) + b'"')
            body.append(b'')  # blank line separates part headers from payload
            body.append(to_bytes(value, enc))
    body.append(b'--' + boundary + b'--')
    # Multipart lines must be separated by CRLF, not by the literal text "rn".
    return b'\r\n'.join(body)


def _get_form(response, formname, formid, formnumber, formxpath):
    """Find the form element.

    Lookup order: by ``name`` attribute, by ``id`` attribute, by XPath
    (walking up to the nearest enclosing ``<form>``), then by index.

    Raises ValueError when the response has no ``<form>`` at all or when
    ``formxpath`` is given but yields no form; raises IndexError when
    ``formnumber`` is out of range. Returns None if nothing matched and
    ``formnumber`` is None.
    """
    root = create_root_node(response.text, lxml.html.HTMLParser,
                            base_url=get_base_url(response))
    forms = root.xpath('//form')
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    if formname is not None:
        f = root.xpath('//form[@name="%s"]' % formname)
        if f:
            return f[0]

    if formid is not None:
        f = root.xpath('//form[@id="%s"]' % formid)
        if f:
            return f[0]

    # Get form element from xpath, if not found, go up
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            while el is not None:
                if el.tag == 'form':
                    return el
                el = el.getparent()
        encoded = formxpath if six.PY3 else formxpath.encode('unicode_escape')
        raise ValueError('No <form> element found with %s' % encoded)

    # If we get here, it means that either formname was None
    # or invalid
    if formnumber is not None:
        try:
            form = forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))
        else:
            return form


def _get_inputs(form, formdata, dont_click, clickdata, response):
    """Collect the ``(name, value)`` pairs that submitting ``form`` would send.

    Values found in the form are overridden by entries of ``formdata``;
    unless ``dont_click`` is true, the clickable element chosen by
    ``_get_clickable`` is included as well.

    Raises ValueError if ``formdata`` cannot be converted to a dict.
    """
    try:
        formdata = dict(formdata or ())
    except (ValueError, TypeError):
        raise ValueError('formdata should be a dict or iterable of tuples')

    # textarea, select, and every input that is not submit/image/reset;
    # checkbox/radio inputs only when they are checked.
    inputs = form.xpath('descendant::textarea'
                        '|descendant::select'
                        '|descendant::input[not(@type) or @type['
                        ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
                        ' and (../@checked or'
                        '  not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
                        namespaces={
                            "re": "http://exslt.org/regular-expressions"})
    values = [(k, u'' if v is None else v)
              for k, v in (_value(e) for e in inputs)
              if k and k not in formdata]

    if not dont_click:
        clickable = _get_clickable(clickdata, form)
        if clickable and clickable[0] not in formdata and clickable[0] is not None:
            values.append(clickable)

    values.extend(formdata.items())
    return values


def _value(ele):
    """Return the ``(name, value)`` pair of a form control element."""
    n = ele.name
    v = ele.value
    if ele.tag == 'select':
        return _select_value(ele, n, v)
    return n, v


def _select_value(ele, n, v):
    """Return the ``(name, value)`` pair of a ``<select>`` element."""
    multiple = ele.multiple
    if v is None and not multiple:
        # Match browser behaviour on simple select tag without options selected
        # And for select tags without options
        o = ele.value_options
        return (n, o[0]) if o else (None, None)
    elif v is not None and multiple:
        # This is a workaround to bug in lxml fixed 2.3.1
        # fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
        selected_options = ele.xpath('.//option[@selected]')
        v = [(o.get('value') or o.text or u'').strip() for o in selected_options]
    return n, v


def _get_clickable(clickdata, form):
    """
    Returns the clickable element specified in clickdata,
    if the latter is given. If not, it returns the first
    clickable element found

    The result is a ``(name, value)`` tuple, or None when the form has no
    clickable element. Raises ValueError when ``clickdata`` matches no
    element or more than one.
    """
    clickables = [
        el for el in form.xpath(
            'descendant::*[(self::input or self::button)'
            ' and re:test(@type, "^submit$", "i")]'
            '|descendant::button[not(@type)]',
            namespaces={"re": "http://exslt.org/regular-expressions"})
    ]
    if not clickables:
        return

    # If we don't have clickdata, we just use the first clickable element
    if clickdata is None:
        el = clickables[0]
        return (el.get('name'), el.get('value') or '')

    # If clickdata is given, we compare it to the clickable elements to find a
    # match. We first look to see if the number is specified in clickdata,
    # because that uniquely identifies the element
    nr = clickdata.get('nr', None)
    if nr is not None:
        try:
            el = list(form.inputs)[nr]
        except IndexError:
            pass
        else:
            return (el.get('name'), el.get('value') or '')

    # We didn't find it, so now we build an XPath expression out of the other
    # arguments, because they can be used as such
    xpath = u'.//*' + \
            u''.join(u'[@%s="%s"]' % c for c in six.iteritems(clickdata))
    el = form.xpath(xpath)
    if len(el) == 1:
        return (el[0].get('name'), el[0].get('value') or '')
    elif len(el) > 1:
        raise ValueError("Multiple elements found (%r) matching the criteria "
                         "in clickdata: %r" % (el, clickdata))
    else:
        raise ValueError('No clickable element matching clickdata: %r' % (clickdata,))

这是我用来调用请求的代码（在我的情况下，我需要上传图片）：

with open(img_path, 'rb') as file:    img = file.read()    file_name = os.path.basename(img_path)    multipart_file = MultipartFile(file_name, img, "image/png")    form_data = {        "param": "value", # this is an example of a text parameter        "PicUpload": multipart_file    }    yield MultipartFormRequest(url=upload_url, formdata=form_data,         callback=self.my_callback)

可惜的是，已经过去了这么长时间，Scrapy仍然没有内置的方法来执行此操作——尤其是考虑到几年前就已经有人写出了一个非常简单的实现。

抓取上传文件

面试问答相关栏目本月热门文章