一种通用数据采集的schema定义形式

{
  "name": "凤凰金融",
  "notice": {
    "data": "attribute",
    "matcher": [
      {
        "match": "xpath",
        "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
      }
    ],
    "comments": "网站通告"
  },
  "url": {
    "data": "attribute",
    "value": "http://www.fengjr.com/financing/list?type=cx",
    "comments": "本平台数据的采集URL"
  },
  "project": {
    "data": "url",
    "url": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "template": ""
    },
    "title": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ]
    },
    "detail": {
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "member": {
    "data": "sub_item",
    "sub_item": {
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "src-save": 0,
      "url": {
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ],
        "template": ""
      }
    },
    "detail": {
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "src-save": 1
}

 

补充:

{
  "name": "凤凰金融",
  "notice": {
    "data": "attribute",
    "matcher": [
      {
        "match": "xpath",
        "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
      }
    ]
  },
  "url": {
    "data": "attribute",
    "value": "http://www.fengjr.com/financing/list?type=cx"
  },
  "project": {
    "data": "url",
    "url": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "template": ""
    },
    "title": {
      "data": "attribute",
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ]
    },
    "detail": {
      "name": "网贷列表",
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "member": {
    "data": "sub_item",
    "sub_item": {
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "src-save": 0,
      "url": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ],
        "template": ""
      }
    },
    "detail": {
      "name": "会员材料",
      "title": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      },
      "amount": {
        "data": "attribute",
        "matcher": [
          {
            "match": "xpath",
            "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
          }
        ]
      }
    }
  },
  "src-save": 1,
  "crawler": {
    "handler": "httpClient|selenium",
    "results": "html|json|text",
    "next_page": {
      "matcher": [
        {
          "match": "xpath",
          "pattern": "//*[@id=\"page-financing\"]/div[1]/div[5]/div/div/div[3]"
        }
      ],
      "template": ""
    },
    "history": "re-crawl|skip|stop"
  }
}

 

你可能感兴趣的:(schema)