I sometimes read novels when I'm bored. There are plenty of novel apps for the phone, but the free ones are mostly full of ads, the ad-free ones mostly require a membership, and reading in the browser is clumsy. So how can you crawl the novel text you want from the web yourself?
A web search shows that crawlers are mostly implemented in Python, and this post is adapted from one such Python-based novel crawler (reference post: ). Since my own Python skills stop at printing hello world, I decided to build the crawler with tools I'm familiar with.
For the networking layer I use Volley, and for HTML parsing I use JSoup; see the JSoup official site for details on its usage.
To use these libraries in the project, just add the dependencies to the build.gradle file:
implementation 'org.jsoup:jsoup:1.13.1'
implementation 'com.android.volley:volley:1.1.1'
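Two setup details: the app needs the INTERNET permission declared in AndroidManifest.xml, and Volley dispatches requests through a RequestQueue. A minimal sketch, assuming the queue is created inside an Activity (the later snippets reuse this queue variable):

// Create the queue once, e.g. in onCreate(); Volley runs its own worker threads.
val queue: RequestQueue = Volley.newRequestQueue(this)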
First, request the web page; the target here is a chapter page from 新笔趣阁 (xsbiquge.com):
val stringRequest = StringRequest(
    Request.Method.GET,
    "https://www.xsbiquge.com/15_15338/128.html",
    { Log.i(TAG, "the data is $it") },                         // response listener
    { Log.i(TAG, "onCreate: case error when get from http") }  // error listener
).setTag(HTTP_TAG)
queue.add(stringRequest)
This fetches the first chapter's data, but the response comes back garbled: the Chinese text turns into mojibake because the body isn't decoded with the page's actual charset. The fix is to subclass StringRequest and override parseNetworkResponse so the bytes are decoded with the charset parsed from the response headers, defaulting to UTF-8:
class NormalRequestTest(
    method: Int,
    url: String,
    listener: Response.Listener<String>,
    errorListener: Response.ErrorListener
) : StringRequest(method, url, listener, errorListener) {

    override fun parseNetworkResponse(response: NetworkResponse): Response<String> {
        val parsed: String = try {
            // Decode the raw bytes with the charset declared in the response
            // headers, falling back to UTF-8 when none is declared.
            String(
                response.data,
                Charset.forName(HttpHeaderParser.parseCharset(response.headers, "UTF-8"))
            )
        } catch (e: IllegalArgumentException) {
            // Unsupported or malformed charset name: fall back to the default decoding.
            String(response.data)
        }
        return Response.success(parsed, HttpHeaderParser.parseCacheHeaders(response))
    }
}
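To use it, just swap the subclass into the earlier request; the URL and callbacks stay the same:

val request = NormalRequestTest(
    Request.Method.GET,
    "https://www.xsbiquge.com/15_15338/128.html",
    { Log.i(TAG, "the data is $it") },
    { Log.i(TAG, "onCreate: case error when get from http") }
).setTag(HTTP_TAG)
queue.add(request)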
After this change the requested data decodes correctly. The crawl itself is wrapped in a SpiderManager singleton: each response is parsed with JSoup to extract the chapter text and the link to the next chapter, which is then requested in turn:
class SpiderManager {
    private val TAG: String = "TestJSoup_SpiderManager"
    private val HTTP_TAG = "story_TAG"
    private val URL_START = "/15_15338/128.html"      // path of the first chapter (second half of the full URL)
    private val URL_ROOT = "https://www.xsbiquge.com" // host (first half of the full URL)

    private val _noteData: MutableLiveData<ArrayList<String>> = MutableLiveData()
    var noteData: LiveData<ArrayList<String>> = _noteData
    var urlData: ArrayList<String> = arrayListOf(URL_START) // chapter paths discovered so far
    var storyData: ArrayList<String> = ArrayList()          // chapter texts fetched so far
    lateinit var queue: RequestQueue
    var shouldStop: Boolean = true

    companion object {
        private var instance: SpiderManager? = null

        // Lazy singleton; not thread-safe, which is fine for this demo.
        fun getInstance(): SpiderManager {
            if (instance == null) {
                instance = SpiderManager()
            }
            return instance!!
        }
    }

    fun start(context: Context) {
        Log.i(TAG, "start: ")
        if (shouldStop) {
            shouldStop = false
            queue = Volley.newRequestQueue(context)
            getData(context, storyData.size) // resume from the first chapter not yet fetched
        }
    }

    fun stop() {
        Log.i(TAG, "stop: ")
        shouldStop = true
        queue.cancelAll(HTTP_TAG)
    }

    fun getData(context: Context, position: Int) {
        Log.i(TAG, "getData: start the position is $position and the url is ${urlData[position]}")
        if (shouldStop) return
        val stringRequest = NormalRequestTest(
            Request.Method.GET,
            URL_ROOT.plus(urlData[position]),
            {
                val doc: Document = Jsoup.parse(it)
                // The "bottem2" block holds the prev-chapter / contents / next-chapter
                // links; the third href is the path of the next chapter.
                urlData.add(
                    doc.getElementsByClass("bottem2")[0]
                        .getElementsByAttribute("href")[2]
                        .attr("href")
                )
                // The chapter text lives in the element with id "content"; the spaces
                // are the page's paragraph indentation, so break lines on them.
                storyData.add(
                    " ".plus(
                        doc.getElementById("content").text().replace(" ", "\n ")
                    )
                )
                _noteData.postValue(storyData)
                if (position != 0 && position % 10 == 0) shouldStop = true // fetch ten chapters per batch
                getData(context, position + 1)
            },
            {
                Log.i(TAG, "onCreate: case error when get from http")
            }
        ).setTag(HTTP_TAG)
        queue.add(stringRequest)
    }
}
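The two JSoup lookups above encode an assumption about the page structure: the chapter text sits in a div with id content, and the third link in the bottem2 block points at the next chapter. A standalone sketch of that extraction, run against hand-written HTML mimicking the assumed structure:

val html = """
    <div id="content">　　第一段　　第二段</div>
    <div class="bottem2">
        <a href="/15_15338/127.html">上一章</a>
        <a href="/15_15338/">目录</a>
        <a href="/15_15338/129.html">下一章</a>
    </div>
""".trimIndent()

val doc = Jsoup.parse(html)
println(doc.getElementById("content").text())    // the chapter text
println(
    doc.getElementsByClass("bottem2")[0]
        .getElementsByAttribute("href")[2]
        .attr("href")                            // "/15_15338/129.html", the next chapter
)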
Finally, observe noteData and render the contents of storyData on screen, and the crawler works as a simple reader.
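A minimal sketch of the display side, assuming a TextView (with the made-up id story_text) inside a ScrollView in the Activity layout:

class ReaderActivity : AppCompatActivity() {
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_reader)
        val storyText: TextView = findViewById(R.id.story_text)
        // Re-render whenever SpiderManager posts a new batch of chapters.
        SpiderManager.getInstance().noteData.observe(this) { chapters ->
            storyText.text = chapters.joinToString("\n\n")
        }
        SpiderManager.getInstance().start(this)
    }
}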