Traitement de données massives
John Samuel
                CPE Lyon
                Année: 2022-2023
                Courriel: john(dot)samuel(at)cpe(dot)fr
                
                 
            
# Image that serves a clone of the johnsamuelwrites.github.io site with
# Apache on port 80.

# Pin the base release instead of :latest so rebuilds are reproducible.
FROM ubuntu:22.04

# MAINTAINER is deprecated; a LABEL carries the same metadata.
LABEL maintainer="John Samuel"

# Refresh the package index and install in the same layer so a cached, stale
# index is never reused. apt-get (not apt) has a stable scripting interface.
# ca-certificates is required for the HTTPS git clone once recommended
# packages are skipped. The index is removed in the same layer so it does not
# remain in the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        apache2 \
        ca-certificates \
        git && \
    rm -rf /var/lib/apt/lists/*

# Replace Apache's default document root with the site, in a single layer.
# A shallow clone is enough because only the current files are served.
RUN rm -rf /var/www/html && \
    git clone --depth 1 https://github.com/johnsamuelwrites/johnsamuelwrites.github.io /var/www/html

# Give Apache an explicit server name so it does not warn at start-up.
RUN echo "ServerName localhost" >> /etc/apache2/apache2.conf

# Documents the port Apache listens on; publish it with `docker run -p`.
EXPOSE 80

# Exec form: Apache runs in the foreground without an intermediate /bin/sh,
# so it receives the stop signal sent by `docker stop`.
CMD ["apachectl", "-D", "FOREGROUND"]
			
        Construction de l'image docker
docker build -t johnsamuel .
Exécution de l'image docker
docker run -dit -p 8080:80 johnsamuel
Voir le lien: http://localhost:8080/
Reconstruction de l'image docker
docker build --no-cache -t johnsamuel .
 
                 
                
        $  head /home/john/Downloads/query.csv
             itemLabel,year
             Amiga E,1993
             Embarcadero Delphi,1995
             Sather,1990
             Microsoft Small Basic,2008
             Squeak,1996
             AutoIt,1999
             Eiffel,1985
             Eiffel,1986
             Kent Recursive Calculator,1981
			
            
                          $ export HADOOP_HOME="..."
                          $ ./hive
                          hive> set hive.metastore.warehouse.dir=${env:HOME}/hive/warehouse;
			
        
                          $./hive
                          hive> set hive.metastore.warehouse.dir=${env:HOME}/hive/warehouse;
                          hive> create database mydb;
                          hive> use mydb;
			
        
        $./hive
        hive> use mydb;
        hive> CREATE TABLE IF NOT EXISTS
             proglang (name String, year int)
             COMMENT "Programming Languages"
             ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
             LINES TERMINATED BY '\n'
             STORED AS TEXTFILE;
        hive> LOAD DATA LOCAL INPATH '/home/john/Downloads/query.csv'
             OVERWRITE INTO TABLE proglang;
			
            
        $./hive
        hive> SELECT * from proglang;
        hive> SELECT * from proglang where year > 1980;
			
        
        $./hive
        hive> DELETE from proglang where year=1980;
	FAILED: SemanticException [Error 10294]: Attempt to do update
	  or delete using transaction manager that does not support these operations.
			
        
        $./hive
        hive> set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
        hive> DELETE from proglang where year=1980;
	FAILED: RuntimeException [Error 10264]: To use
	  DbTxnManager you must set hive.support.concurrency=true
        hive> set hive.support.concurrency=true;
        hive> DELETE from proglang where year=1980;
	FAILED: SemanticException [Error 10297]: Attempt to do update
	  or delete on table mydb.proglang that is not transactional
        hive> ALTER TABLE proglang set TBLPROPERTIES ('transactional'='true') ;
	FAILED: Execution Error, return code 1 from
          org.apache.hadoop.hive.ql.exec.DDLTask. Unable to alter table.
          The table must be stored using an ACID compliant format
	  (such as ORC): mydb.proglang
			
        
        $./hive
        hive> use mydb;
        hive> CREATE TABLE IF NOT EXISTS
             proglangorc (name String, year int)
             COMMENT "Programming Languages"
             ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
             LINES TERMINATED BY '\n'
             STORED AS ORC;
        hive> LOAD DATA LOCAL INPATH '/home/john/Downloads/query.csv'
             OVERWRITE INTO TABLE proglangorc;
	FAILED: SemanticException Unable to load data to destination table.
          Error: The file that you are trying to load does not match
	   the file format of the destination table.
			
            
        $./hive
        hive> insert overwrite table proglangorc select * from proglang;
        hive> DELETE from proglangorc where year=1980;
	FAILED: SemanticException [Error 10297]: Attempt to do update
	  or delete on table mydb.proglangorc that is not transactional
        hive> ALTER TABLE proglangorc set TBLPROPERTIES ('transactional'='true') ;
        hive> DELETE from proglangorc where year=1980;
        hive> SELECT count(*) from proglangorc;
        hive> SELECT count(*) from proglangorc where year=1980;
			
             
                
                          $./pyspark
   >>> lines = sc.textFile("/home/john/Downloads/query.csv")
   >>> lineLengths = lines.map(lambda s: len(s))
   >>> totalLength = lineLengths.reduce(lambda a, b: a + b)
   >>> print(totalLength)
			
         
                
                          $./pyspark
   >>> lines = sc.textFile("/home/john/Downloads/query.csv")
   >>> lineWordCount = lines.map(lambda s: len(s.split()))
   >>> totalWords = lineWordCount.reduce(lambda a, b: a + b)
   >>> print(totalWords)
			
         
                
                          $ export SPARK_HOME='.../spark/spark-x.x.x-bin-hadoopx.x/bin'
			  $ export PYSPARK_PYTHON=/usr/bin/python3
			  $ export PYSPARK_DRIVER_PYTHON=jupyter
			  $ export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
			  $ ./pyspark
			
         
                 
                 
                
			from pyspark.sql import HiveContext
                        sqlContext = HiveContext(sc)
                        sqlContext.sql("use default")
                        sqlContext.sql("show tables").show()
			
        
			+--------+---------+-----------+
                        |database|tableName|isTemporary|
                        +--------+---------+-----------+
                        | default| proglang|      false|
                        | default|proglang2|      false|
                        +--------+---------+-----------+
			
        
			result = sqlContext.sql("SELECT count(*) FROM proglang ")
			result.show()
			+--------+
                        |count(1)|
                        +--------+
                        |     611|
                        +--------+
			
        print(type(result))
<class 'pyspark.sql.dataframe.DataFrame'>
			import pandas as pd
			result = sqlContext.sql("SELECT count(*) as count FROM proglang ")
			resultFrame = result.toPandas()
			print(resultFrame)
			
            
                        |count|
                        +-----+
                        |  611|
			
        
			import pandas as pd
			result = sqlContext.sql("SELECT * FROM proglang ")
                        resultFrame = result.toPandas()
                        groups = resultFrame.groupby('year').count()
                        print(groups)
			
        
			import nltk
                        nltk.download('vader_lexicon')
			
        
			from nltk.sentiment.vader import SentimentIntensityAnalyzer
                        sia = SentimentIntensityAnalyzer()
                        sentiment = sia.polarity_scores("this movie is good")
                        print(sentiment)
                        sentiment = sia.polarity_scores("this movie is not very good")
                        print(sentiment)
                        sentiment = sia.polarity_scores("this movie is bad")
                        print(sentiment)
			
        
			{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}
                        {'neg': 0.344, 'neu': 0.656, 'pos': 0.0, 'compound': -0.3865}
                        {'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'compound': -0.5423}