Python DuplicatesPipeline Example

说明

Python DuplicatesPipeline 示例是从最受好评的开源项目中提取的实现代码,你可以参考下面示例的使用方式。

编程语言: Python

命名空间/包名称: pa11ycrawler.pipelines

示例#1
文件: test_pipelines.py 项目: edx/pa11ycrawler

def test_duplicates_pipeline_courseware_start():
    """Run six courseware URLs through a single pipeline, in order.

    Expected outcome per submission:

    * course ``foo``: the trailing-slash URL is accepted, ``.../1`` is
      dropped, ``.../2`` is accepted;
    * course ``quux``: ``.../1`` is accepted, the trailing-slash URL is
      dropped, ``.../6`` is accepted.

    An accepted item must come back equal to what was submitted; a dropped
    item must raise ``DropItem``.
    """
    pipeline = DuplicatesPipeline()
    spider = object()

    # (url, expect_drop) -- order matters, the pipeline keeps state
    # between calls.
    cases = [
        ("https://courses.edx.org/courses/foo/courseware/bar/baz/", False),
        ("https://courses.edx.org/courses/foo/courseware/bar/baz/1", True),
        ("https://courses.edx.org/courses/foo/courseware/bar/baz/2", False),
        ("https://courses.edx.org/courses/quux/courseware/bar/baz/1", False),
        ("https://courses.edx.org/courses/quux/courseware/bar/baz/", True),
        ("https://courses.edx.org/courses/quux/courseware/bar/baz/6", False),
    ]

    for url, expect_drop in cases:
        item = {"url": url}
        if expect_drop:
            with pytest.raises(DropItem):
                pipeline.process_item(item, spider)
            continue
        result = pipeline.process_item(item, spider)
        assert item == result

示例#2
文件: test_pipelines.py 项目: singingwolfboy/pa11ycrawler

def test_duplicates_pipeline():
    """Check that an item is dropped when its URL was already submitted.

    Five items are sent through one pipeline in order. Items with a URL
    not submitted before must be returned equal to the input; items
    repeating an earlier URL must raise ``DropItem``, even when they carry
    extra fields.
    """
    pipeline = DuplicatesPipeline()
    spider = object()

    # (item, expect_drop) -- order matters, the pipeline keeps state
    # between calls.
    cases = [
        # first item: no problem
        ({"url": "google.com"}, False),
        # second item is different, so no problem
        ({"url": "edx.org"}, False),
        # third is the same as a previous, so raises an exception
        ({"url": "google.com"}, True),
        # fourth is different, so no problem
        ({"url": "edx.org/foo"}, False),
        # fifth has other, different properties, but the URL is the same
        ({"url": "edx.org/foo", "page_title": "TitleCase"}, True),
    ]

    for item, expect_drop in cases:
        if expect_drop:
            with pytest.raises(DropItem):
                pipeline.process_item(item, spider)
            continue
        result = pipeline.process_item(item, spider)
        assert item == result

示例#3
文件: test_pipelines.py 项目: singingwolfboy/pa11ycrawler

def test_duplicates_pipeline_querystring():
    """Check that URLs differing only in their query string are dropped.

    The first ``/register`` URL (with ``?next=foo``) must be returned equal
    to the input. Each later URL on the same path -- a different value for
    the same parameter, a different parameter, and no query string at
    all -- must raise ``DropItem``.
    """
    pipeline = DuplicatesPipeline()
    spider = object()

    # The first submission is accepted unchanged.
    first = {"url": "https://courses.edx.org/register?next=foo"}
    result = pipeline.process_item(first, spider)
    assert first == result

    # Every later submission on the same path is expected to be dropped.
    dropped_urls = (
        "https://courses.edx.org/register?next=bar",
        "https://courses.edx.org/register?foo=bar",
        "https://courses.edx.org/register",
    )
    for url in dropped_urls:
        item = {"url": url}
        with pytest.raises(DropItem):
            pipeline.process_item(item, spider)

展开阅读全文