a. Business requirement

b. log4j.properties
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
# Set the default spark-shell log level to INFO. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=INFO
# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=DEBUG
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=INFO
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=INFO
log4j.logger.parquet=INFO
# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=INFO
log4j.logger.com.bigdata.spark=INFO
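If editing log4j.properties on the classpath is inconvenient, the same kind of tuning can be done programmatically through the log4j 1.2 API before the SparkContext starts. A minimal sketch follows; the helper object name is made up for illustration, the jetty and com.bigdata.spark levels mirror the entries above, and the blanket ERROR level for Spark internals mirrors the sc.setLogLevel("ERROR") call used later in the code.

import org.apache.log4j.{Level, Logger}

// Sketch: programmatic equivalent of part of the log4j.properties above.
object LogLevelSetup {
  def apply(): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)        // quiet Spark internals
    Logger.getLogger("org.spark_project.jetty").setLevel(Level.WARN)  // quiet embedded Jetty
    Logger.getLogger("com.bigdata.spark").setLevel(Level.INFO)        // keep application logs at INFO
  }
}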
c. pom.xml

<properties>
    <spark.version>2.1.1</spark.version>
    <scala.version>2.11.8</scala.version>
    <log4j.version>1.2.17</log4j.version>
    <slf4j.version>1.7.22</slf4j.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>jcl-over-slf4j</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>${log4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-pool2</artifactId>
        <version>2.4.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-configuration2</artifactId>
        <version>2.2</version>
    </dependency>
    <dependency>
        <groupId>commons-beanutils</groupId>
        <artifactId>commons-beanutils</artifactId>
        <version>1.9.3</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>0.10.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>druid</artifactId>
        <version>1.1.10</version>
    </dependency>
    <dependency>
        <groupId>net.sf.json-lib</groupId>
        <artifactId>json-lib</artifactId>
        <version>2.4</version>
    </dependency>
    <dependency>
        <groupId>org.json4s</groupId>
        <artifactId>json4s-native_2.11</artifactId>
        <version>3.2.11</version>
    </dependency>
    <dependency>
        <groupId>org.json4s</groupId>
        <artifactId>json4s-jackson_2.11</artifactId>
        <version>3.2.11</version>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.6.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
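Since the pom pins Spark 2.1.1 with Scala 2.11, the Spark 2 SparkSession entry point and the built-in CSV reader are available as a more idiomatic alternative to the SQLContext used in the code below. A minimal sketch, assuming the same placeholder input path as the main program; the object name is only for illustration.

import org.apache.spark.sql.{DataFrame, SparkSession}

// Sketch: SparkSession-based variant of the CSV read performed in the main code.
object SparkSessionCsvSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SparkExcelReadApp")
      .master("local[*]")
      .getOrCreate()

    // Built-in CSV source in Spark 2.x; the header row becomes the column names.
    val df: DataFrame = spark.read
      .option("header", "true")
      .option("delimiter", ",")
      .csv("C:\\text") // placeholder path, same as in the main code

    df.createOrReplaceTempView("t_table_init")
    spark.stop()
  }
}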
d. Code
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object SparkExcelReadApp {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("SparkExcelReadApp").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    // Schema of the exported business file: every column is read as a string.
    val fieldSchema = StructType(Array(
      StructField("busi_key", StringType, true),
      StructField("sde_names", StringType, true),
      StructField("sdeid_card_name", StringType, true),
      StructField("sde_tax_number", StringType, true),
      StructField("sde_card_type", StringType, true),
      StructField("anenterprise_name", StringType, true),
      StructField("anenterprise_status", StringType, true),
      StructField("anenterprise_bodytype", StringType, true),
      StructField("anEnterprise_enonce_name", StringType, true)
    ))

    val sqlContext = new SQLContext(sc)

    // Read the CSV export. Spark 2.x ships a built-in CSV data source,
    // so the external com.databricks.spark.csv package is not required.
    val df: DataFrame = sqlContext.read.format("csv")
      .option("header", "true")
      .option("delimiter", ",")
      //.option("inferSchema", true.toString)
      .schema(fieldSchema)
      .load("C:\\text")
    df.createOrReplaceTempView("t_table_init")

    // One representative row per enterprise name (first() picks an arbitrary row of each group).
    val nameDF: DataFrame = sqlContext.sql(
      """SELECT
        |  first(a.busi_key)                 AS busi_key,
        |  first(a.sde_names)                AS sde_names,
        |  first(a.sdeid_card_name)          AS sdeid_card_name,
        |  first(a.sde_tax_number)           AS sde_tax_number,
        |  first(a.sde_card_type)            AS sde_card_type,
        |  first(a.anenterprise_name)        AS anenterprise_name,
        |  first(a.anenterprise_status)      AS anenterprise_status,
        |  first(a.anenterprise_bodytype)    AS anenterprise_bodytype,
        |  first(a.anEnterprise_enonce_name) AS anEnterprise_enonce_name
        |FROM t_table_init a
        |GROUP BY a.anenterprise_name""".stripMargin)

    // All rows whose enterprise name occurs more than once, ordered by name.
    val lastDF: DataFrame = sqlContext.sql(
      """SELECT
        |  busi_key,
        |  sde_names,
        |  sdeid_card_name,
        |  sde_tax_number,
        |  sde_card_type,
        |  anenterprise_name,
        |  anenterprise_status,
        |  anenterprise_bodytype,
        |  anEnterprise_enonce_name
        |FROM t_table_init
        |WHERE anenterprise_name IN (
        |  SELECT anenterprise_name
        |  FROM t_table_init
        |  GROUP BY anenterprise_name
        |  HAVING COUNT(anenterprise_name) > 1
        |)
        |ORDER BY anenterprise_name""".stripMargin)

    // coalesce(1) produces a single CSV part file per result set.
    nameDF.coalesce(1).write.option("header", "true").csv("E:\\测试Excel\\name")
    lastDF.coalesce(1).write.option("header", "true").csv("E:\\测试Excel\\out")
    /*nameDF.write.option("header", "true").csv("E:\\测试Excel\\name")
    lastDF.write.option("header", "true").csv("E:\\测试Excel\\out")*/

    println("Success!")
  }
}
e. Wrap-up
After this requirement was delivered it turned out that the CSV output files were very large, and the business side ultimately wanted the results stored in MongoDB. The code can be extended along the same lines (a rough sketch of the MongoDB variant follows below); writing the data to HBase or ES instead works just as well, since Spark itself is extensible and integrates with many other frameworks.
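As a rough illustration of the MongoDB direction, one option is the MongoDB Spark connector (mongo-spark-connector_2.11, which is not in the pom above). This is only a sketch: the connector's format name and option keys are taken from its documentation as I recall it, and the object name, URI, database, and collection are placeholders, not part of the original project.

import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical sketch: persisting the de-duplicated result to MongoDB instead of CSV.
// Assumes mongo-spark-connector_2.11 is on the classpath; all names below are placeholders.
object MongoSinkSketch {
  def saveToMongo(df: DataFrame): Unit = {
    df.write
      .format("com.mongodb.spark.sql.DefaultSource") // data source of the MongoDB Spark connector
      .mode(SaveMode.Append)
      .option("uri", "mongodb://localhost:27017")    // placeholder connection string
      .option("database", "enterprise")              // placeholder database name
      .option("collection", "dedup_result")          // placeholder collection name
      .save()
  }
}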