This file is indexed.

/usr/lib/python3/dist-packages/pandas/io/gbq.py is in python3-pandas 0.14.1-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.
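
For orientation, here is a minimal usage sketch of the module's two entry points. This is a hypothetical session: the project id and table names are illustrative, and the module only runs under Python 2 with Google's API client libraries installed.

    from pandas.io import gbq

    # Run a query and fetch the results as a DataFrame (illustrative names).
    df = gbq.read_gbq("SELECT name, age FROM test_dataset.test_table",
                      project_id="my-project-id")

    # Stream the DataFrame into a BigQuery table in 10,000-row chunks.
    gbq.to_gbq(df, "test_dataset.test_table_copy", project_id="my-project-id")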

from datetime import datetime
import json
import logging
import sys
from time import sleep
import uuid

import numpy as np
import pkg_resources

from distutils.version import LooseVersion
from pandas import compat
from pandas.core.api import DataFrame
from pandas.tools.merge import concat
from pandas.core.common import PandasError


_GOOGLE_API_CLIENT_INSTALLED = False
_GOOGLE_API_CLIENT_VALID_VERSION = False
_GOOGLE_FLAGS_INSTALLED = False
_GOOGLE_FLAGS_VALID_VERSION = False
_HTTPLIB2_INSTALLED = False

if not compat.PY3:
    
    try:
        from apiclient.discovery import build
        from apiclient.http import MediaFileUpload
        from apiclient.errors import HttpError

        from oauth2client.client import OAuth2WebServerFlow
        from oauth2client.client import AccessTokenRefreshError
        from oauth2client.client import flow_from_clientsecrets
        from oauth2client.file import Storage
        from oauth2client.tools import run
        _GOOGLE_API_CLIENT_INSTALLED = True
        _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version

        if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= LooseVersion('1.2.0'):
            _GOOGLE_API_CLIENT_VALID_VERSION = True

    except ImportError:
        _GOOGLE_API_CLIENT_INSTALLED = False


    try:
        import gflags as flags
        _GOOGLE_FLAGS_INSTALLED = True

        _GOOGLE_FLAGS_VERSION = pkg_resources.get_distribution('python-gflags').version

        if LooseVersion(_GOOGLE_FLAGS_VERSION) >= LooseVersion('2.0.0'):
            _GOOGLE_FLAGS_VALID_VERSION = True

    except ImportError:
        _GOOGLE_FLAGS_INSTALLED = False

    try:
        import httplib2
        _HTTPLIB2_INSTALLED = True
    except ImportError:
        _HTTPLIB2_INSTALLED = False


logger = logging.getLogger('pandas.io.gbq')
logger.setLevel(logging.ERROR)

class InvalidPageToken(PandasError, IOError):
    """
    Raised when Google BigQuery fails to return,
    or returns a duplicate page token.
    """
    pass

class InvalidQueryException(PandasError, IOError):
    """
    Raised when a malformed query is given to read_gbq.
    """
    pass

class AccessDeniedException(PandasError, IOError):
    """
    Raised when invalid credentials are provided, or tokens have expired.
    """
    pass

class NotFoundException(PandasError, IOError):
    """
    Raised when the project_id/table provided in the query could not be found.
    """
    pass

class TermsOfServiceNotAcceptedException(PandasError, IOError):
    """
    Raised when the terms of service were not accepted or have been unaccepted.
    """
    pass

class UnknownGBQException(PandasError, IOError):
    """
    Raised when an unrecognized Google API Error occurs.
    """
    pass


class InvalidColumnOrder(PandasError, IOError):
    """
    Raised when the provided column order for output
    results DataFrame does not match the schema
    returned by BigQuery.
    """
    pass

class GbqConnector(object):
    def __init__(self, project_id, reauth=False):
        self.project_id     = project_id
        self.reauth         = reauth
        self.credentials    = self.get_credentials()
        self.service        = self.get_service(self.credentials)

    def get_credentials(self):
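        # Run the OAuth2 installed-application flow for the 'pandas GBQ'
        # client; the token is cached in bigquery_credentials.dat in the
        # current working directory and reused until invalid.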
        flow = OAuth2WebServerFlow(client_id='495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd.apps.googleusercontent.com',
                                   client_secret='kOc9wMptUtxkcIFbtZCcrEAc',
                                   scope='https://www.googleapis.com/auth/bigquery',
                                   redirect_uri='urn:ietf:wg:oauth:2.0:oob')

        storage = Storage('bigquery_credentials.dat')
        credentials = storage.get()

        if credentials is None or credentials.invalid or self.reauth:
            credentials = run(flow, storage)

        return credentials

    def get_service(self, credentials):
        http = httplib2.Http()
        http = credentials.authorize(http)
        bigquery_service = build('bigquery', 'v2', http=http)

        return bigquery_service

    def run_query(self, query):
        job_collection = self.service.jobs()
        job_data = {
            'configuration': {
                'query': {
                    'query': query
                    # 'allowLargeResults', 'createDisposition', 'preserveNulls', 'destinationTable', 'useQueryCache'
                }
            }
        }

        try:
            query_reply = job_collection.insert(projectId=self.project_id,
                                                body=job_data).execute()
            status = query_reply['status']
        except AccessTokenRefreshError:
            raise AccessDeniedException("The credentials have been revoked or expired, please re-run "
                "the application to re-authorize")
        except HttpError as ex:
            status = json.loads(ex.content)['error']


        errors = status.get('errors', None)

        if errors:
            reasons = [error['reason'] for error in errors]
            if 'accessDenied' in reasons:
                raise AccessDeniedException
            elif 'invalidQuery' in reasons:
                raise InvalidQueryException
            elif 'notFound' in reasons:
                raise NotFoundException
            elif 'termsOfServiceNotAccepted' in reasons:
                raise TermsOfServiceNotAcceptedException
            else:
                raise UnknownGBQException(errors)

        job_reference = query_reply['jobReference']

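        # Poll until BigQuery reports the job complete; each iteration asks
        # the server for the job's current status and any available results.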
        while 'jobComplete' not in query_reply:
            print('Job not yet complete...')
            query_reply = job_collection.getQueryResults(
                            projectId=job_reference['projectId'],
                            jobId=job_reference['jobId']).execute()

        total_rows = int(query_reply['totalRows'])
        result_pages = list()
        seen_page_tokens = list()
        current_row = 0
        # Only read schema on first page
        schema = query_reply['schema']

        # Loop through each page of data
        while 'rows' in query_reply and current_row < total_rows:
            page = query_reply['rows']
            result_pages.append(page)
            current_row += len(page)
            page_token = query_reply.get('pageToken', None)

            if not page_token and current_row < total_rows:
                raise InvalidPageToken("Required pageToken was missing. Recieved {0} of {1} rows".format(current_row,total_rows))

            elif page_token in seen_page_tokens:
                raise InvalidPageToken("A duplicate pageToken was returned")

            seen_page_tokens.append(page_token)
            query_reply = job_collection.getQueryResults(
                            projectId=job_reference['projectId'],
                            jobId=job_reference['jobId'],
                            pageToken=page_token).execute()

        if current_row < total_rows:
            raise InvalidPageToken()

        return schema, result_pages

    def load_data(self, dataframe, dataset_id, table_id, chunksize, verbose):
        job_id = uuid.uuid4().hex
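        # The random job id prefixes each row's insertId below so BigQuery
        # can de-duplicate rows if an insert chunk is retried.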
        rows = []
        remaining_rows = len(dataframe)

        if verbose:
            total_rows = remaining_rows
            sys.stdout.write("\n\n")
            sys.stdout.flush()

        for index, row in dataframe.reset_index(drop=True).iterrows():
            row_dict = dict()
            row_dict['json'] = json.loads(row.to_json(force_ascii=False,
                                                      date_unit='s',
                                                      date_format='iso'))
            row_dict['insertId'] = job_id + str(index)
            rows.append(row_dict)
            remaining_rows -= 1

            if (len(rows) % chunksize == 0) or (remaining_rows == 0):
                if verbose:
                    sys.stdout.write("\rStreaming Insert is {0}% Complete".format(((total_rows - remaining_rows) * 100) / total_rows))
                    sys.stdout.flush()

                body = {'rows': rows}
                response = self.service.tabledata().insertAll(
                                                        projectId=self.project_id,
                                                        datasetId=dataset_id,
                                                        tableId=table_id,
                                                        body=body).execute()
                if 'insertErrors' in response:
                    raise UnknownGBQException(response)

                sleep(1) # Maintains the inserts "per second" rate per API
                rows = []

        if verbose:
            sys.stdout.write("\n")
            sys.stdout.flush()

def _parse_data(schema, rows):
    # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
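    # INTEGER columns are mapped to float so missing values can be stored
    # as NaN; numpy integer arrays have no missing-value representation.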
    dtype_map = {'INTEGER': np.dtype(float),
                 'FLOAT': np.dtype(float),
                 'TIMESTAMP': 'M8[ns]'}     # numpy datetime64 needs the
                                            # explicit nanosecond unit here

    fields = schema['fields']
    col_types = [field['type'] for field in fields]
    col_names = [field['name'].encode('ascii', 'ignore') for field in fields]
    col_dtypes = [dtype_map.get(field['type'], object) for field in fields]
    page_array = np.zeros((len(rows),),
                          dtype=zip(col_names, col_dtypes))

    for row_num, raw_row in enumerate(rows):
        entries = raw_row.get('f', [])
        for col_num, field_type in enumerate(col_types):
            field_value = _parse_entry(entries[col_num].get('v', ''),
                                       field_type)
            page_array[row_num][col_num] = field_value

    return DataFrame(page_array)

def _parse_entry(field_value, field_type):
    if field_value is None or field_value == 'null':
        return None
    if field_type == 'INTEGER' or field_type == 'FLOAT':
        return float(field_value)
    elif field_type == 'TIMESTAMP':
        timestamp = datetime.utcfromtimestamp(float(field_value))
        return np.datetime64(timestamp)
    elif field_type == 'BOOLEAN':
        return field_value == 'true'
    return field_value

def _test_imports():

    if compat.PY3:
        raise NotImplementedError("Google's libraries do not support Python 3 yet")

    if not _GOOGLE_API_CLIENT_INSTALLED:
        raise ImportError('Could not import Google API Client.')

    if not _GOOGLE_FLAGS_INSTALLED:
        raise ImportError('Could not import Google Command Line Flags Module.')

    if not _GOOGLE_API_CLIENT_VALID_VERSION:
        raise ImportError("pandas requires google-api-python-client >= 1.2.0 for Google "
                          "BigQuery support, current version " + _GOOGLE_API_CLIENT_VERSION)

    if not _GOOGLE_FLAGS_VALID_VERSION:
        raise ImportError("pandas requires python-gflags >= 2.0.0 for Google "
                          "BigQuery support, current version " + _GOOGLE_FLAGS_VERSION)

    if not _HTTPLIB2_INSTALLED:
        raise ImportError("pandas requires httplib2 for Google BigQuery support")

def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False):
    """Load data from Google BigQuery.

    THIS IS AN EXPERIMENTAL LIBRARY

    The main method a user calls to execute a query in Google BigQuery and read results
    into a pandas DataFrame using the v2 Google API client for Python. Documentation for
    the API is available at https://developers.google.com/api-client-library/python/.
    Authentication to the Google BigQuery service is via OAuth 2.0 using the product name
    'pandas GBQ'.

    Parameters
    ----------
    query : str
        SQL-like query to return data values
    project_id : str
        Google BigQuery Account project ID.
    index_col : str (optional)
        Name of result column to use for index in results DataFrame
    col_order : list(str) (optional)
        List of BigQuery column names in the desired order for results
        DataFrame
    reauth : boolean (default False)
        Force Google BigQuery to reauthenticate the user. This is useful
        if multiple accounts are used.

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query
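
    Examples
    --------
    A hypothetical call; the project id is illustrative:

    >>> df = read_gbq("SELECT name FROM test_dataset.test_table",
    ...               project_id="my-project-id")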

    """

    _test_imports()

    if not project_id:
        raise TypeError("Missing required parameter: project_id")

    connector = GbqConnector(project_id, reauth=reauth)
    schema, pages = connector.run_query(query)
    dataframe_list = [_parse_data(schema, page) for page in pages]

    final_df = concat(dataframe_list, ignore_index=True)

    # Reindex the DataFrame on the provided column
    if index_col is not None:
        if index_col in final_df.columns:
            final_df.set_index(index_col, inplace=True)
        else:
            raise InvalidColumnOrder(
                'Index column "{0}" does not exist in DataFrame.'
                .format(index_col)
            )

    # Change the order of columns in the DataFrame based on provided list
    if col_order is not None:
        if sorted(col_order) == sorted(final_df.columns):
            final_df = final_df[col_order]
        else:
            raise InvalidColumnOrder(
                'Column order does not match this DataFrame.'
            )

    # Downcast floats to integers and objects to booleans
    # if there are no NaN's. This is presently due to a
    # limitation of numpy in handling missing data.
    final_df._data = final_df._data.downcast(dtypes='infer')
    return final_df

def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
           verbose=True, reauth=False):
    """Write a DataFrame to a Google BigQuery table.

    THIS IS AN EXPERIMENTAL LIBRARY

    If the table exists, the dataframe will be written to the table using
    the defined table schema and column types. For simplicity, this method
    uses the Google BigQuery streaming API. The to_gbq method chunks data
    into a default chunk size of 10,000. Failures return the complete error
    response which can be quite long depending on the size of the insert. 
    There are several important limitations of the Google streaming API 
    which are detailed at:
    https://developers.google.com/bigquery/streaming-data-into-bigquery.

    Parameters
    ----------
    dataframe : DataFrame
        DataFrame to be written
    destination_table : string
        Name of table to be written, in the form 'dataset.tablename'
    project_id : str
        Google BigQuery Account project ID.
    chunksize : int (default 10000)
        Number of rows to be inserted in each chunk from the dataframe.
    verbose : boolean (default True)
        Show percentage complete
    reauth : boolean (default False)
        Force Google BigQuery to reauthenticate the user. This is useful
        if multiple accounts are used.
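
    Examples
    --------
    A hypothetical call; the table and project names are illustrative:

    >>> to_gbq(df, 'test_dataset.test_table', project_id='my-project-id')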

    """
    _test_imports()

    if not project_id:
        raise TypeError("Missing required parameter: project_id")

    if '.' not in destination_table:
        raise NotFoundException("Invalid Table Name. Should be of the form 'datasetId.tableId'")

    connector = GbqConnector(project_id, reauth=reauth)
    dataset_id, table_id = destination_table.rsplit('.', 1)

    connector.load_data(dataframe, dataset_id, table_id, chunksize, verbose)