https://linen.dev logo
#advice-data-ingestion
Title
# advice-data-ingestion
t

Tien Nguyen

09/24/2022, 10:56 PM
Copy code
#
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
#


import json
import os.path
import datetime
from typing import Dict, Generator,Mapping,Any, Union

from .kindful_api import *
from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models import (
    AirbyteCatalog,
    AirbyteConnectionStatus,
    AirbyteMessage,
    AirbyteRecordMessage,
AirbyteStateMessage,
    AirbyteStream,
    ConfiguredAirbyteCatalog,
    Status,
    Type

)
from airbyte_cdk.sources import Source

class SourceKindfulApi(Source):
    """Airbyte source exposing the Kindful ``transaction``, ``contact``, ``funds`` and ``group`` streams."""

    def check(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
        """
        Tests if the input configuration can be used to successfully connect to the integration.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as
            specified in the properties of the spec.json file

        :return: AirbyteConnectionStatus indicating a Success or Failure
        """
        try:
            # TODO: not implemented yet -- no connection is attempted, so this always reports success.
            return AirbyteConnectionStatus(status=Status.SUCCEEDED)
        except Exception as e:
            return AirbyteConnectionStatus(status=Status.FAILED, message=f"An exception occurred: {str(e)}")

    def discover(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog describing the four streams offered by this source.

        Each stream's JSON schema is loaded from ``json_schema/<stream name>.json`` located next to this
        module. Every stream is declared with both ``full_refresh`` and ``incremental`` sync modes and a
        source-defined cursor on ``updated_at``.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source (not consulted here)

        :return: AirbyteCatalog listing the ``transaction``, ``contact``, ``funds`` and ``group`` streams.
        """
        dirname = os.path.dirname(os.path.realpath(__file__))

        streams = []
        for stream_name in ("transaction", "contact", "funds", "group"):
            schema = read_json(os.path.join(dirname, f"json_schema/{stream_name}.json"))
            streams.append(
                AirbyteStream(
                    name=stream_name,
                    json_schema=schema,
                    supported_sync_modes=["full_refresh", "incremental"],
                    source_defined_cursor=True,
                    default_cursor_field=["updated_at"],
                )
            )
        return AirbyteCatalog(streams=streams)

    def read(
        self, logger: AirbyteLogger, config: Mapping[str, Any], catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages produced by reading every stream of the configured catalog.

        For each row whose ``updated_at`` can be parsed, a STATE message carrying the normalised
        ``updated_at`` as ``cursor`` is yielded, followed by the RECORD message. Rows whose ``updated_at``
        cannot be parsed are skipped.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source (not consulted here)
        :param catalog: The ConfiguredAirbyteCatalog selecting which streams to read
        :param state: State object from previous runs; it is updated in place with the latest cursor of each
            stream. The saved cursor is not used to filter rows in this method.

        :return: A generator of STATE and RECORD AirbyteMessage objects.
        :raises TypeError: if the catalog contains a stream name this source does not know.
        """
        # NOTE(review): the token is a hard-coded placeholder; it is not taken from `config`.
        token = authorization("<your token>")
        contact = contact_query(token)
        transaction = transaction_query(token)
        meta = meta_api(token)

        # The same query object is shared by the contact and transaction queries.
        data_query = {"query": ["not_linked"]}
        contact.query = data_query
        transaction.query = data_query

        for stream in catalog.streams:
            stream_name = stream.stream.name
            if stream_name == "transaction":
                rows = transaction
            elif stream_name == "contact":
                rows = contact
            elif stream_name == "funds":
                rows = meta.funds
            elif stream_name == "group":
                rows = meta.groups
            else:
                raise TypeError("Unknown stream")
            yield from self._emit_rows(state, stream, rows)

    @staticmethod
    def _emit_rows(state, stream, rows):
        """Yield a STATE then a RECORD message for every row of ``rows`` with a parseable ``updated_at``."""
        for row in rows:
            normalised, parsed = time_check(row)
            if not parsed:
                continue
            # Normalise before building the state so that every stream (including "group", which used to
            # emit the raw string) reports its cursor in the same form as the record it belongs to.
            row["updated_at"] = normalised
            # NOTE(review): the state is emitted before its record -- confirm against the Airbyte protocol
            # whether the checkpoint should instead follow the record it covers.
            yield generate_state(state, stream, {"cursor": row["updated_at"]})
            yield generate_record(stream, row)

def get_stream_cursor(state: Dict[str, any], stream: str) -> int:
    """
    Look up the saved cursor of ``stream`` in ``state``.

    :param state: Mapping of stream name to that stream's saved state (a mapping with a ``cursor`` key)
    :param stream: Name of the stream to look up

    :return: The saved cursor, or None when the stream has no entry or its cursor is falsy.
    :raises KeyError: if the stream has an entry that lacks the ``cursor`` key.
    """
    if stream not in state:
        return None

    saved = state[stream]["cursor"]
    # Falsy cursors (empty string, 0, None) are all reported as "no cursor".
    return saved if saved else None

def read_json(file_path):
    """
    Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on the platform's
    default locale encoding.

    :param file_path: Path of the JSON file to read
    :return: The parsed JSON value
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
def generate_record(stream: any, data: any):
    """
    Wrap ``data`` in a RECORD AirbyteMessage for ``stream``.

    A shallow copy of ``data`` is emitted, so the caller's object is left untouched. Date and datetime
    values at the top level of the copy are replaced by their ISO-8601 string.

    :param stream: Configured stream; its ``stream.name`` becomes the record's stream name
    :param data: Mapping holding the record's fields
    :return: AirbyteMessage of type RECORD, with ``emitted_at`` set to the current time in epoch milliseconds
    """
    payload = data.copy()

    # Timestamps need to be emitted in ISO format. `datetime.datetime` is a subclass of `datetime.date`,
    # so this single check also covers the plain dates that `time_check` produces for `updated_at`.
    for key, value in list(payload.items()):
        if isinstance(value, datetime.date):
            payload[key] = value.isoformat()

    # Scale to milliseconds before truncating; truncating first would discard the sub-second part.
    emitted_at_ms = int(datetime.datetime.now().timestamp() * 1000)

    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=stream.stream.name, data=payload, emitted_at=emitted_at_ms),
    )
def generate_state(state: Dict[str, any], stream: any, data: any):
    """
    Store ``data`` as the state of ``stream`` and wrap the whole state in a STATE AirbyteMessage.

    :param state: Mapping of stream name to state; updated in place
    :param stream: Configured stream; its ``stream.name`` is the key written into ``state``
    :param data: New state value for the stream
    :return: AirbyteMessage of type STATE carrying the full ``state`` mapping
    """
    stream_name = stream.stream.name
    state[stream_name] = data
    state_message = AirbyteStateMessage(data=state)
    return AirbyteMessage(type=Type.STATE, state=state_message)
def time_check(data: any) -> tuple:
    """
    Parse ``data["updated_at"]`` and report whether it could be understood.

    Two textual formats are tried in order:

    * ISO-8601 as accepted by ``datetime.fromisoformat``, e.g. ``2011-07-17T00:57:48.000-06:00``
    * ``%Y-%m-%d %H:%M:%S UTC``, e.g. ``2011-07-17 00:57:48 UTC``

    :param data: Record mapping expected to hold an ``updated_at`` string
    :return: ``(date, True)`` with the calendar date of the timestamp when parsing succeeds, or
        ``(None, False)`` when ``updated_at`` is missing, is not a string, or matches neither format.
    """
    try:
        time_st = data["updated_at"]
    except KeyError:
        return None, False

    # A null or otherwise non-string value is treated like any other unparseable timestamp rather than
    # being allowed to raise TypeError from the parsers below.
    if not isinstance(time_st, str):
        return None, False

    try:
        return datetime.datetime.fromisoformat(time_st).date(), True
    except ValueError:
        pass

    try:
        return datetime.datetime.strptime(time_st, '%Y-%m-%d %H:%M:%S UTC').date(), True
    except ValueError:
        # Unsupported timestamp format: report failure for now instead of raising.
        return None, False
11 Views