down vote
favorite
Community,
I have written the following pyspark.sql query. However, pyspark doesn't appear to recognize the SQL query 'TOP 20 PERCENT'.
Can you let me know if I have to reformat the number '20'
Any thoughts will be greatly appreciated.
Carlton
Hello Community,
Can someone let me know how to get email notifications when someone has replied to my question?
For some reason email notifications aren't coming through
favorite
Community,
I have written the following pyspark.sql query. However, pyspark doesn't appear to recognize the SQL query 'TOP 20 PERCENT'.
#%% import findspark findspark.init('/home/packt/spark-2.1.0-bin-hadoop2.7') from pyspark.sql import SparkSession spark = SparkSession.builder.appName('ops').getOrCreate() df = spark.read.csv('/home/packt/Downloads/Spark_DataFrames/HumanResources_Employee.csv',inferSchema=True,header=True) df.createOrReplaceTempView('HumanResources_Employee') myresults = spark.sql("""SELECT TOP 20 PERCENT NationalIDNumber ,JobTitle ,BirthDate FROM HumanResources_Employee""") myresults.show()As you can see from the results below, pyspark isn't able to recognize the number '20'
Can you let me know if I have to reformat the number '20'
Error:Append ResultsClear Results
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
~/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
~/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
Py4JJavaError: An error occurred while calling o19.sql.
: org.apache.spark.sql.catalyst.parser.ParseException:
extraneous input '20' expecting {<EOF>, '(', ',', '.', '[', 'SELECT', 'FROM', 'ADD', 'AS', 'ALL', 'DISTINCT', 'WHERE', 'GROUP', 'BY', 'GROUPING', 'SETS', 'CUBE', 'ROLLUP', 'ORDER', 'HAVING', 'LIMIT', 'AT', 'OR', 'AND', 'IN', NOT, 'NO', 'EXISTS', 'BETWEEN', 'LIKE', RLIKE, 'IS', 'NULL', 'TRUE', 'FALSE', 'NULLS', 'ASC', 'DESC', 'FOR', 'INTERVAL', 'CASE', 'WHEN', 'THEN', 'ELSE', 'END', 'JOIN', 'CROSS', 'OUTER', 'INNER', 'LEFT', 'SEMI', 'RIGHT', 'FULL', 'NATURAL', 'ON', 'LATERAL', 'WINDOW', 'OVER', 'PARTITION', 'RANGE', 'ROWS', 'UNBOUNDED', 'PRECEDING', 'FOLLOWING', 'CURRENT', 'FIRST', 'LAST', 'ROW', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'VIEW', 'REPLACE', 'INSERT', 'DELETE', 'INTO', 'DESCRIBE', 'EXPLAIN', 'FORMAT', 'LOGICAL', 'CODEGEN', 'CAST', 'SHOW', 'TABLES', 'COLUMNS', 'COLUMN', 'USE', 'PARTITIONS', 'FUNCTIONS', 'DROP', 'UNION', 'EXCEPT', 'MINUS', 'INTERSECT', 'TO', 'TABLESAMPLE', 'STRATIFY', 'ALTER', 'RENAME', 'ARRAY', 'MAP', 'STRUCT', 'COMMENT', 'SET', 'RESET', 'DATA', 'START', 'TRANSACTION', 'COMMIT', 'ROLLBACK', 'MACRO', 'IF', EQ, '<=>', '<>', '!=', '<', LTE, '>', GTE, '+', '-', '*', '/', '%', 'DIV', '&', '|', '^', 'PERCENT', 'BUCKET', 'OUT', 'OF', 'SORT', 'CLUSTER', 'DISTRIBUTE', 'OVERWRITE', 'TRANSFORM', 'REDUCE', 'USING', 'SERDE', 'SERDEPROPERTIES', 'RECORDREADER', 'RECORDWRITER', 'DELIMITED', 'FIELDS', 'TERMINATED', 'COLLECTION', 'ITEMS', 'KEYS', 'ESCAPED', 'LINES', 'SEPARATED', 'FUNCTION', 'EXTENDED', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'LAZY', 'FORMATTED', 'GLOBAL', TEMPORARY, 'OPTIONS', 'UNSET', 'TBLPROPERTIES', 'DBPROPERTIES', 'BUCKETS', 'SKEWED', 'STORED', 'DIRECTORIES', 'LOCATION', 'EXCHANGE', 'ARCHIVE', 'UNARCHIVE', 'FILEFORMAT', 'TOUCH', 'COMPACT', 'CONCATENATE', 'CHANGE', 'CASCADE', 'RESTRICT', 'CLUSTERED', 'SORTED', 'PURGE', 'INPUTFORMAT', 'OUTPUTFORMAT', DATABASE, DATABASES, 'DFS', 'TRUNCATE', 'ANALYZE', 'COMPUTE', 'LIST', 'STATISTICS', 'PARTITIONED', 'EXTERNAL', 'DEFINED', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'REPAIR', 'RECOVER', 'EXPORT', 'IMPORT', 'LOAD', 'ROLE', 'ROLES', 'COMPACTIONS', 'PRINCIPALS', 'TRANSACTIONS', 'INDEX', 'INDEXES', 'LOCKS', 'OPTION', 'ANTI', 'LOCAL', 'INPATH', 'CURRENT_DATE', 'CURRENT_TIMESTAMP', IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 11)
== SQL ==
SELECT TOP 20 PERCENT
-----------^^^
NationalIDNumber
,JobTitle
,BirthDate
FROM HumanResources_Employee
at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:197)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:99)
at org.apache.spark.sql.execution.SparkSqlParser.parse(SparkSqlParser.scala:45)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parsePlan(ParseDriver.scala:53)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:592)
at sun.reflect.GeneratedMethodAccessor69.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
ParseException Traceback (most recent call last)
<ipython-input-74-b6642cf94ff4> in <module>()
10 ,JobTitle
11 ,BirthDate
---> 12 FROM HumanResources_Employee""")
13 myresults.show()
~/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/session.py in sql(self, sqlQuery)
539 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
540 """
--> 541 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
542
543 @since(2.0)
~/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
~/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
72 if s.startswith('org.apache.spark.sql.catalyst.parser.ParseException: '):
---> 73 raise ParseException(s.split(': ', 1)[1], stackTrace)
74 if s.startswith('org.apache.spark.sql.streaming.StreamingQueryException: '):
75 raise StreamingQueryException(s.split(': ', 1)[1], stackTrace)
As you can see from above the problem appear to be with the number 20. Any thoughts will be greatly appreciated.
Carlton
Hello Community,
Can someone let me know how to get email notifications when someone has replied to my question?
For some reason email notifications aren't coming through