"""Effect of PySpark's StringIndexer on clustering of data.

Demonstrates how StringIndexer's ``stringOrderType`` ("frequencyDesc" vs
"frequencyAsc") changes where the encoded category codes land, and hence
how the encoded data clusters relative to the origin.
"""
import pandas as pd
import seaborn as sns
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark import SparkContext
from pyspark.sql import SQLContext  # Main entry point for DataFrame and SQL functionality.

# Skewed categorical data: within each column the value frequencies run
# 10, 9, 8, ..., 2, 1 (55 rows total), so frequency order is unambiguous.
df = pd.DataFrame({
    "col1": ['A']*10 + ['B']*9 + ['C']*8 + ['D']*7 + ['E']*6 + ['F']*5
            + ['G']*4 + ['H']*3 + ['I']*2 + ['J'],
    "col2": ['K']*10 + ['L']*9 + ['M']*8 + ['N']*7 + ['O']*6 + ['P']*5
            + ['Q']*4 + ['R']*3 + ['S']*2 + ['T'],
    "col3": ['U']*10 + ['V']*9 + ['W']*8 + ['X']*7 + ['Y']*6 + ['Z']*5
            + ['a']*4 + ['b']*3 + ['c']*2 + ['d'],
})

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)
sdf = sqlCtx.createDataFrame(df)

# --- Encoding based on Frequency-Descending order --------------------------
# "frequencyDesc" is the default stringOrderType for PySpark's StringIndexer,
# so no explicit stringOrderType argument is needed here.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(sdf)
            for column in list(sdf.columns)]
# FIX: the original snippet called df_r.show() without ever creating df_r —
# the Pipeline/transform step below was missing, which raises a NameError.
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(sdf).transform(sdf)
df_r.show()
# Output:
# +----+----+----+----------+----------+----------+
# |col1|col2|col3|col1_index|col2_index|col3_index|
# +----+----+----+----------+----------+----------+
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   A|   K|   U|       0.0|       0.0|       0.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   B|   L|   V|       1.0|       1.0|       1.0|
# |   C|   M|   W|       2.0|       2.0|       2.0|
# +----+----+----+----------+----------+----------+
# only showing top 20 rows

# Project each encoded column onto an axis (col1 on y, col2 on x, col3 on
# the diagonal) so the three encodings can be seen in one scatterplot.
pandas_df = df_r.toPandas()
a = pd.DataFrame({'x': [0]*55, 'y': list(pandas_df.col1_index.values)})
b = pd.DataFrame({'x': list(pandas_df.col2_index.values), 'y': [0]*55})
c = pd.DataFrame({'x': list(pandas_df.col3_index.values),
                  'y': list(pandas_df.col3_index.values)})
d = pd.concat([a, b, c], axis=0)
sns.scatterplot(x='x', y='y', data=d, alpha=0.3)

# --- Encoding based on Frequency-Ascending order ---------------------------
# FIX: the original comment claimed this was the default order — it is not;
# "frequencyDesc" is the default, which is why stringOrderType must be
# overridden with "frequencyAsc" here.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index",
                          stringOrderType="frequencyAsc").fit(sdf)
            for column in list(sdf.columns)]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(sdf).transform(sdf)
df_r.show(55)
# Output:
# +----+----+----+----------+----------+----------+
# |col1|col2|col3|col1_index|col2_index|col3_index|
# +----+----+----+----------+----------+----------+
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   A|   K|   U|       9.0|       9.0|       9.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   B|   L|   V|       8.0|       8.0|       8.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   C|   M|   W|       7.0|       7.0|       7.0|
# |   D|   N|   X|       6.0|       6.0|       6.0|
# |   D|   N|   X|       6.0|       6.0|       6.0|
# |   D|   N|   X|       6.0|       6.0|       6.0|
# |   D|   N|   X|       6.0|       6.0|       6.0|
# |   D|   N|   X|       6.0|       6.0|       6.0|
# |   D|   N|   X|       6.0|       6.0|       6.0|
# |   D|   N|   X|       6.0|       6.0|       6.0|
# |   E|   O|   Y|       5.0|       5.0|       5.0|
# |   E|   O|   Y|       5.0|       5.0|       5.0|
# |   E|   O|   Y|       5.0|       5.0|       5.0|
# |   E|   O|   Y|       5.0|       5.0|       5.0|
# |   E|   O|   Y|       5.0|       5.0|       5.0|
# |   E|   O|   Y|       5.0|       5.0|       5.0|
# |   F|   P|   Z|       4.0|       4.0|       4.0|
# |   F|   P|   Z|       4.0|       4.0|       4.0|
# |   F|   P|   Z|       4.0|       4.0|       4.0|
# |   F|   P|   Z|       4.0|       4.0|       4.0|
# |   F|   P|   Z|       4.0|       4.0|       4.0|
# |   G|   Q|   a|       3.0|       3.0|       3.0|
# |   G|   Q|   a|       3.0|       3.0|       3.0|
# |   G|   Q|   a|       3.0|       3.0|       3.0|
# |   G|   Q|   a|       3.0|       3.0|       3.0|
# |   H|   R|   b|       2.0|       2.0|       2.0|
# |   H|   R|   b|       2.0|       2.0|       2.0|
# |   H|   R|   b|       2.0|       2.0|       2.0|
# |   I|   S|   c|       1.0|       1.0|       1.0|
# |   I|   S|   c|       1.0|       1.0|       1.0|
# |   J|   T|   d|       0.0|       0.0|       0.0|
# +----+----+----+----------+----------+----------+

# Same projection as before, now for the ascending encoding.
pandas_df = df_r.toPandas()
a = pd.DataFrame({'x': [0]*55, 'y': list(pandas_df.col1_index.values)})
b = pd.DataFrame({'x': list(pandas_df.col2_index.values), 'y': [0]*55})
c = pd.DataFrame({'x': list(pandas_df.col3_index.values),
                  'y': list(pandas_df.col3_index.values)})
d = pd.concat([a, b, c], axis=0)
sns.scatterplot(x='x', y='y', data=d, alpha=0.3)

# Conclusion:
# If data is encoded based on 'Frequency-Descending' order, the data gets
# clustered near the origin. And if data is encoded based on
# 'Frequency-Ascending' order, the data is clustered away from the origin.
# In this case, the number of clusters might come out to be more than the
# number of clusters that will form in 'Frequency-Descending' order
# encoded data.
Effect of PySpark's StringIndexer on clustering of data
Subscribe to:
Posts (Atom)
No comments:
Post a Comment