Skip to content

Commit a15e6d3

Browse files
committed
fix style
1 parent d29dd53 commit a15e6d3

File tree

9 files changed

+123
-54
lines changed

9 files changed

+123
-54
lines changed

.scalafmt.conf

+16
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
maxColumn = 120
2+
docstrings = ScalaDoc
3+
continuationIndent.callSite = 2
4+
align.openParenCallSite = false
5+
align.openParenDefnSite = false
6+
align.arrowEnumeratorGenerator = false
7+
align.ifWhileOpenParen = false
8+
align.tokens = []
9+
newlines.alwaysBeforeTopLevelStatements = false
10+
newlines.alwaysBeforeElseAfterCurlyIf = false
11+
rewrite.rules = [ExpandImportSelectors, AvoidInfix, RedundantBraces, RedundantParens, SortModifiers]
12+
rewrite.redundantBraces.stringInterpolation = true
13+
rewrite.redundantBraces.includeUnitMethods = true
14+
continuationIndent.defnSite = 2
15+
includeCurlyBraceInSelectChains = true
16+
optIn.breakChainOnFirstMethodDot = true

master/src/main/scala/com/github/jaitl/crawler/master/queue/QueueTaskBalancer.scala

+23-7
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,12 @@ import com.github.jaitl.crawler.models.worker.WorkerManager.TasksBatchProcessRes
1212
import scala.util.Random
1313

1414
class QueueTaskBalancer(
15-
queueTaskQueueReqCtrl: ActorRef,
16-
queueTaskQueueResCtrl: ActorRef
17-
) extends Actor with ActorLogging {
15+
queueTaskQueueReqCtrl: ActorRef,
16+
queueTaskQueueResCtrl: ActorRef
17+
) extends Actor
18+
with ActorLogging {
19+
20+
// scalastyle:off method.length
1821
override def receive: Receive = {
1922
case RequestTasksBatch(requestId, taskTypes) if taskTypes.nonEmpty =>
2023
log.debug(s"RequestTasksBatch, requestId: $requestId, types: $taskTypes")
@@ -23,10 +26,18 @@ class QueueTaskBalancer(
2326
sender() ! EmptyTaskTypeList
2427
} else if (taskTypes.lengthCompare(1) == 0) {
2528
val task = taskTypes.head
26-
queueTaskQueueReqCtrl ! QueueTaskRequestController.RequestTask(requestId, task.taskType, task.batchSize, sender())
29+
queueTaskQueueReqCtrl ! QueueTaskRequestController.RequestTask(
30+
requestId,
31+
task.taskType,
32+
task.batchSize,
33+
sender())
2734
} else {
2835
val task = Random.shuffle(taskTypes.toIndexedSeq).head
29-
queueTaskQueueReqCtrl ! QueueTaskRequestController.RequestTask(requestId, task.taskType, task.batchSize, sender())
36+
queueTaskQueueReqCtrl ! QueueTaskRequestController.RequestTask(
37+
requestId,
38+
task.taskType,
39+
task.batchSize,
40+
sender())
3041
}
3142

3243
case TasksBatchProcessResult(requestId, taskType, successIds, failureIds, skippedIds, bannedIds, newTasks) =>
@@ -45,8 +56,13 @@ class QueueTaskBalancer(
4556
queueTaskQueueResCtrl ! QueueTaskResultController.MarkAsProcessed(requestId, taskType, bannedIds, sender())
4657
}
4758
if (newTasks.nonEmpty) {
48-
newTasks.foreach { case (newTaskType, newTasksData) =>
49-
queueTaskQueueResCtrl ! QueueTaskResultController.AddNewTasks(requestId, newTaskType, newTasksData, sender())
59+
newTasks.foreach {
60+
case (newTaskType, newTasksData) =>
61+
queueTaskQueueResCtrl ! QueueTaskResultController.AddNewTasks(
62+
requestId,
63+
newTaskType,
64+
newTasksData,
65+
sender())
5066
}
5167
}
5268

scalastyle_config.xml

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
<check level="warning" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
1212
<check level="warning" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
1313
<parameters>
14-
<parameter name="maxLineLength"><![CDATA[160]]></parameter>
14+
<parameter name="maxLineLength"><![CDATA[120]]></parameter>
1515
<parameter name="tabSize"><![CDATA[4]]></parameter>
1616
</parameters>
1717
</check>

worker/src/main/scala/com/github/jaitl/crawler/worker/WorkerManager.scala

+2-2
Original file line number | Diff line number | Diff line change
@@ -56,8 +56,8 @@ private[worker] class WorkerManager(
5656
batchControllers += tasksBatchController -> TaskBatchContext(tasksBatchController, Instant.now(), taskType)
5757
context.watch(tasksBatchController)
5858
tasksBatchController ! TasksBatchController.ExecuteTask
59-
log.info(
60-
s"SuccessTasksBatchRequest size: ${context.children.size} batchControllers: ${batchControllers.size}, id: $requestId")
59+
log.info(s"SuccessTasksBatchRequest size: ${context.children.size} " +
60+
s"batchControllers: ${batchControllers.size}, id: $requestId")
6161
case FailureTasksBatchRequest(requestId, taskType, throwable) =>
6262
case NoTasks(requestId, taskType) =>
6363
case EmptyTaskTypeList =>

worker/src/main/scala/com/github/jaitl/crawler/worker/executor/SaveCrawlResultController.scala

+54-30
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,16 @@ import com.github.jaitl.crawler.models.task.Task
1313
import com.github.jaitl.crawler.models.worker.WorkerManager.TasksBatchProcessResult
1414
import com.github.jaitl.crawler.worker.crawler.CrawlResult
1515
import com.github.jaitl.crawler.worker.creator.TwoArgumentActorCreator
16-
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.{AddResults, BannedTask, FailedTask, FailureSaveResults, SaveCrawlResultControllerConfig, SaveResults, SkippedTask, SuccessAddedResults, SuccessCrawledTask, SuccessSavedResults}
16+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.AddResults
17+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.BannedTask
18+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.FailedTask
19+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.FailureSaveResults
20+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.SaveCrawlResultControllerConfig
21+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.SaveResults
22+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.SkippedTask
23+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.SuccessAddedResults
24+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.SuccessCrawledTask
25+
import com.github.jaitl.crawler.worker.executor.SaveCrawlResultController.SuccessSavedResults
1726
import com.github.jaitl.crawler.worker.parser.ParseResult
1827
import com.github.jaitl.crawler.worker.pipeline.Pipeline
1928
import com.github.jaitl.crawler.worker.scheduler.Scheduler
@@ -28,13 +37,17 @@ class SaveCrawlResultController[T](
2837
queueTaskBalancer: ActorRef,
2938
tasksBatchController: ActorRef,
3039
saveScheduler: Scheduler,
31-
config: SaveCrawlResultControllerConfig
32-
) extends Actor with ActorLogging with Stash {
33-
private implicit val executionContext: ExecutionContext = context.dispatcher
34-
35-
var successTasks: mutable.Seq[SuccessCrawledTask] = mutable.ArraySeq.empty[SuccessCrawledTask]
40+
config: SaveCrawlResultControllerConfig)
41+
extends Actor
42+
with ActorLogging
43+
with Stash {
44+
implicit private val executionContext: ExecutionContext = context.dispatcher
45+
46+
var successTasks: mutable.Seq[SuccessCrawledTask] =
47+
mutable.ArraySeq.empty[SuccessCrawledTask]
3648
var failedTasks: mutable.Seq[FailedTask] = mutable.ArraySeq.empty[FailedTask]
37-
var skippedTasks: mutable.Seq[SkippedTask] = mutable.ArraySeq.empty[SkippedTask]
49+
var skippedTasks: mutable.Seq[SkippedTask] =
50+
mutable.ArraySeq.empty[SkippedTask]
3851
var bannedTasks: mutable.Seq[BannedTask] = mutable.ArraySeq.empty[BannedTask]
3952

4053
override def preStart(): Unit = {
@@ -43,7 +56,7 @@ class SaveCrawlResultController[T](
4356
saveScheduler.schedule(config.saveInterval, self, SaveResults)
4457
}
4558

46-
override def receive: Receive = addResultHandler orElse waitSave
59+
override def receive: Receive = addResultHandler.orElse(waitSave)
4760

4861
private def addResultHandler: Receive = {
4962
case AddResults(result) =>
@@ -73,7 +86,8 @@ class SaveCrawlResultController[T](
7386
case SaveResults =>
7487
context.become(saveResultHandler)
7588

76-
val parserResults = successTasks.flatMap(_.parseResult).map(_.parsedData.asInstanceOf[T])
89+
val parserResults =
90+
successTasks.flatMap(_.parseResult).map(_.parsedData.asInstanceOf[T])
7791
val rawResult = successTasks.map(r => (r.task, r.crawlResult))
7892

7993
val saveFuture: Future[SaveResults] = for {
@@ -91,7 +105,7 @@ class SaveCrawlResultController[T](
91105
case ex: Exception => FailureSaveResults(ex)
92106
}
93107

94-
recoveredSaveFuture pipeTo self
108+
recoveredSaveFuture.pipeTo(self)
95109
}
96110

97111
private def saveResultHandler: Receive = {
@@ -103,8 +117,11 @@ class SaveCrawlResultController[T](
103117
val failureIds = failedTasks.map(_.task.id)
104118
val skippedIds = skippedTasks.map(_.task.id)
105119
val bannedIds = bannedTasks.map(_.task.id)
106-
val newCrawlTasks = successTasks.flatMap(_.parseResult.map(_.newCrawlTasks).getOrElse(Seq.empty))
107-
val newTasks = newCrawlTasks.groupBy(_.taskType)
120+
val newCrawlTasks = successTasks.flatMap(
121+
_.parseResult.map(_.newCrawlTasks).getOrElse(Seq.empty)
122+
)
123+
val newTasks = newCrawlTasks
124+
.groupBy(_.taskType)
108125
.map {
109126
case (taskType, vals) =>
110127
val newTasks = vals.flatMap(_.tasks).distinct
@@ -150,7 +167,11 @@ object SaveCrawlResultController {
150167
case object SuccessAddedResults
151168

152169
trait CrawlTaskResult
153-
case class SuccessCrawledTask(task: Task, crawlResult: CrawlResult, parseResult: Option[ParseResult[_]]) extends CrawlTaskResult
170+
case class SuccessCrawledTask(
171+
task: Task,
172+
crawlResult: CrawlResult,
173+
parseResult: Option[ParseResult[_]]
174+
) extends CrawlTaskResult
154175
case class FailedTask(task: Task, t: Throwable) extends CrawlTaskResult
155176
case class SkippedTask(task: Task, t: Throwable) extends CrawlTaskResult
156177
case class BannedTask(task: Task, t: Throwable) extends CrawlTaskResult
@@ -162,14 +183,16 @@ object SaveCrawlResultController {
162183
queueTaskBalancer: ActorRef,
163184
tasksBatchController: ActorRef,
164185
saveScheduler: Scheduler,
165-
config: SaveCrawlResultControllerConfig
166-
): Props = Props(new SaveCrawlResultController(
167-
pipeline = pipeline,
168-
queueTaskBalancer = queueTaskBalancer,
169-
tasksBatchController = tasksBatchController,
170-
saveScheduler = saveScheduler,
171-
config = config
172-
))
186+
config: SaveCrawlResultControllerConfig): Props =
187+
Props(
188+
new SaveCrawlResultController(
189+
pipeline = pipeline,
190+
queueTaskBalancer = queueTaskBalancer,
191+
tasksBatchController = tasksBatchController,
192+
saveScheduler = saveScheduler,
193+
config = config
194+
)
195+
)
173196

174197
def name(): String = "saveCrawlResultController"
175198
}
@@ -179,16 +202,17 @@ private[worker] class SaveCrawlResultControllerCreator(
179202
saveScheduler: Scheduler,
180203
config: SaveCrawlResultControllerConfig
181204
) extends TwoArgumentActorCreator[Pipeline[_], ActorRef] {
182-
override def create(factory: ActorRefFactory, firstArg: Pipeline[_], secondArg: ActorRef): ActorRef = {
205+
override def create(factory: ActorRefFactory, firstArg: Pipeline[_], secondArg: ActorRef): ActorRef =
183206
factory.actorOf(
184-
props = SaveCrawlResultController.props(
185-
pipeline = firstArg,
186-
queueTaskBalancer = queueTaskBalancer,
187-
tasksBatchController = secondArg,
188-
saveScheduler = saveScheduler,
189-
config = config
190-
).withDispatcher("worker.blocking-io-dispatcher"),
207+
props = SaveCrawlResultController
208+
.props(
209+
pipeline = firstArg,
210+
queueTaskBalancer = queueTaskBalancer,
211+
tasksBatchController = secondArg,
212+
saveScheduler = saveScheduler,
213+
config = config
214+
)
215+
.withDispatcher("worker.blocking-io-dispatcher"),
191216
name = SaveCrawlResultController.name()
192217
)
193-
}
194218
}

worker/src/main/scala/com/github/jaitl/crawler/worker/executor/resource/ProxyResourceController.scala

+9-2
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,18 @@ import akka.actor.ActorLogging
88
import akka.actor.Props
99
import com.github.jaitl.crawler.worker.executor.resource.ProxyResourceController.ExecutorContext
1010
import com.github.jaitl.crawler.worker.executor.resource.ProxyResourceController.ProxyConfig
11-
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.{NoFreeResource, NoResourcesAvailable, RequestResource, ReturnBannedResource, ReturnFailedResource, ReturnSkippedResource, ReturnSuccessResource, SuccessRequestResource}
12-
import com.github.jaitl.crawler.worker.http.agent.UserAgentGenerator
11+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.NoFreeResource
12+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.NoResourcesAvailable
13+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.RequestResource
14+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.ReturnBannedResource
15+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.ReturnFailedResource
16+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.ReturnSkippedResource
17+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.ReturnSuccessResource
18+
import com.github.jaitl.crawler.worker.executor.resource.ResourceController.SuccessRequestResource
1319
import com.github.jaitl.crawler.worker.http.HttpRequestExecutor
1420
import com.github.jaitl.crawler.worker.http.HttpRequestExecutorConfig
1521
import com.github.jaitl.crawler.worker.http.ProxyType
22+
import com.github.jaitl.crawler.worker.http.agent.UserAgentGenerator
1623
import com.github.jaitl.crawler.worker.timeout.RandomTimeout
1724

1825
import scala.collection.mutable

worker/src/main/scala/com/github/jaitl/crawler/worker/http/AsyncHttpRequestExecutor.scala

+3-1
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,12 @@ import java.util.UUID
44

55
import com.typesafe.scalalogging.StrictLogging
66
import io.netty.handler.codec.http.HttpHeaderNames
7-
import org.asynchttpclient.{AsyncHttpClient, BoundRequestBuilder, Realm}
87
import org.asynchttpclient.Dsl.asyncHttpClient
98
import org.asynchttpclient.Dsl.config
109
import org.asynchttpclient.proxy.ProxyServer
10+
import org.asynchttpclient.AsyncHttpClient
11+
import org.asynchttpclient.BoundRequestBuilder
12+
import org.asynchttpclient.Realm
1113

1214
import scala.compat.java8.FutureConverters.toScala
1315
import scala.concurrent.ExecutionContext

worker/src/main/scala/com/github/jaitl/crawler/worker/save/ElasticSearchSaveParsedProvider.scala

+4-2
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
package com.github.jaitl.crawler.worker.save
22

3-
import com.sksamuel.elastic4s.http._
3+
import com.sksamuel.elastic4s.http.ElasticClient
4+
import com.sksamuel.elastic4s.http.ElasticProperties
45

56
import scala.concurrent.ExecutionContext
67
import scala.concurrent.Future
7-
import com.sksamuel.elastic4s.http.ElasticDsl._
88

99

1010
class ElasticSearchSaveParsedProvider[T](
@@ -13,6 +13,8 @@ class ElasticSearchSaveParsedProvider[T](
1313
port: Int,
1414
clusername: String
1515
)(implicit converter: ElasticSearchTypeConverter[T]) extends SaveParsedProvider[T] {
16+
import com.sksamuel.elastic4s.http.ElasticDsl._ // scalastyle:off
17+
1618
val client = ElasticClient(ElasticProperties(s"http://$server:$port?cluster.name=$clusername"))
1719

1820
override def saveResults(parsedData: Seq[T])(implicit executionContext: ExecutionContext): Future[Unit] = Future {

worker/src/main/scala/com/github/jaitl/crawler/worker/save/S3SaveRawProvider.scala

+11-9
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
package com.github.jaitl.crawler.worker.save
22

3-
import java.io.{BufferedWriter, File, FileWriter}
3+
import java.io.BufferedWriter
4+
import java.io.File
5+
import java.io.FileWriter
46

57
import com.github.jaitl.crawler.models.task.Task
68
import com.github.jaitl.crawler.worker.crawler.CrawlResult
@@ -12,12 +14,13 @@ import scala.concurrent.ExecutionContext.Implicits.global
1214
import scala.concurrent.Future
1315

1416
class S3SaveRawProvider(
15-
val accessKey: String,
16-
val secretKey: String,
17-
val bucketName: String,
18-
val path: String = "",
19-
val endpoint: String = "https://s3-us-west-1.amazonaws.com"
20-
) extends SaveRawProvider with StrictLogging {
17+
val accessKey: String,
18+
val secretKey: String,
19+
val bucketName: String,
20+
val path: String = "",
21+
val endpoint: String = "https://s3-us-west-1.amazonaws.com"
22+
) extends SaveRawProvider
23+
with StrictLogging {
2124
val credentials = new BasicAWSCredentials(accessKey, secretKey)
2225
val client = new AmazonS3Client(credentials)
2326

@@ -28,8 +31,7 @@ class S3SaveRawProvider(
2831
val bw = new BufferedWriter(new FileWriter(file))
2932
bw.write(r._2.data)
3033
bw.close()
31-
client.putObject(bucketName,
32-
r._1.taskType.concat("/").concat(r._1.id).concat("/").concat(r._1.taskData), file)
34+
client.putObject(bucketName, r._1.taskType.concat("/").concat(r._1.id).concat("/").concat(r._1.taskData), file)
3335
logger.debug(s"Saving crawler result to: $fileName")
3436
})
3537
}

0 commit comments

Comments
 (0)