#!/usr/bin/env perl
#
#   A script that creates a JSON array of simulated BFF/PXF
#
#   Note: Check also Monarch Initiative:
#   https://github.com/monarch-initiative/PhenoImp
#
#   Last Modified: Nov/08/2024
#
#   $VERSION taken from Pheno::Ranker
#
#   Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
#
#   License: Artistic License 2.0
#
#   If this program helps you in your research, please cite.

use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

##### Main #####
randomize_ga4gh();
################
exit;

sub randomize_ga4gh {
    my $VERSION = '0.09';
    my $format  = 'bff';
    my $number  = 100;
    my $output  = 'individuals.json';
    my ( $phenotypicFeatures, $diseases, $treatments, $procedures, $exposures,
        $ethnicity )
      = (1) x 6;

    # Reading arguments
    GetOptions(
        'format|f=s' => \$format,    # string
        'number|n=i' => \$number,    # string
        'output|o=s' => \$output,    # string

        #
        'diseases=i'           => \$diseases,              # integer
        'exposures=i'          => \$exposures,             # integer
        'phenotypicFeatures=i' => \$phenotypicFeatures,    # integer
        'procedures=i'         => \$procedures,            # integer
        'treatments=i'         => \$treatments,            # integer

        #
        'max-diseases-pool=i'           => \my $max_diseases_pool,             # integer
        'max-ethnicity-pool=i'          => \my $max_ethnicity_pool,            # integer
        'max-exposures-pool=i'          => \my $max_exposures_pool,            # integer
        'max-phenotypicFeatures-pool=i' => \my $max_phenotypicFeatures_pool,   # integer
        'max-treatments-pool=i'         => \my $max_treatments_pool,           # integer
        'max-procedures-pool=i'         => \my $max_procedures_pool,           # integer

        #
        'random-seed=i'         => \my $random_seed,                           # integer
        'external-ontologies=s' => \my $ext_ontologies,                        # string

        #
        'help|?'    => \my $help,                                              # flag
        'man'       => \my $man,                                               # flag
        'debug=i'   => \my $debug,                                             # integer
        'verbose|'  => \my $verbose,                                           # flag
        'version|V' => sub { print "$0 Version $VERSION\n"; exit; }
    ) or pod2usage(2);
    pod2usage(1)                              if $help;
    pod2usage( -verbose => 2, -exitval => 0 ) if $man;

    # Create object
    my $randomize = Randomizer->new(
        {
            format                      => $format,
            number                      => $number,
            output                      => $output,
            diseases                    => $diseases,
            ethnicity                   => $ethnicity,
            exposures                   => $exposures,
            phenotypicFeatures          => $phenotypicFeatures,
            procedures                  => $procedures,
            treatments                  => $treatments,
            max_diseases_pool           => $max_diseases_pool,
            max_ethnicity_pool          => $max_ethnicity_pool,
            max_exposures_pool          => $max_exposures_pool,
            max_phenotypicFeatures_pool => $max_phenotypicFeatures_pool,
            max_procedures_pool         => $max_procedures_pool,
            max_treatments_pool         => $max_treatments_pool,
            random_seed                 => $random_seed,
            ext_ontologies              => $ext_ontologies,
            debug                       => $debug,
            verbose                     => $verbose
        }
    );

    # Run method
    $randomize->run;
}

package Randomizer;

use strict;
use warnings;
use autodie;
use feature qw(say);

#use Data::Printer;
use Data::Dumper;
use Path::Tiny;
use List::Util 1.50 qw(head shuffle);
use JSON::XS;
use Data::Fake qw(Core Company Dates Names);
use FindBin    qw($Bin);
use lib $Bin;
use Ontologies
  qw($hpo_array $omim_array $rxnorm_array $ncit_procedures_array $ncit_exposures_array $ethnicity_array);

sub new {
    my ( $class, $self ) = @_;
    bless $self, $class;
    return $self;
}

sub run {
    my $self        = shift;
    my $number      = $self->{number};
    my $format      = $self->{format};
    my $output      = $self->{output};
    my $random_seed = $self->{random_seed};
    my %func        = (
        bff => \&bff_generator,
        pxf => \&pxf_generator
    );

    # Set seed if defined
    srand($random_seed) if defined $random_seed;    # user can set it to 0

    # Load external ontologies file if present
    $self->{ontologies_data} =
      $self->{ext_ontologies}
      ? validate_json( $self->{ext_ontologies} )
      : undef;                                      # setter

    #########
    # START #
    #########

    my $json_data;
    for ( my $i = 1 ; $i <= $number ; $i++ ) {
        push @$json_data, $func{$format}->( $i, $self );
    }

    #######
    # END #
    #######
    #p $json_data;

    # Serialize the data and write
    write_json( { filepath => $output, data => $json_data } );
}

sub write_json {
    my $arg       = shift;
    my $file      = $arg->{filepath};
    my $json_data = $arg->{data};

    # Note that canonical DOES not match the order of nsort from Sort::Naturally
    my $json = JSON::XS->new->utf8->canonical->pretty->encode($json_data);
    path($file)->spew_utf8($json);
    return 1;
}

sub pxf_generator {
    my ( $id, $self ) = @_;
    my $result_hash = run_functions($self);
    my $pxf         = fake_hash(
        {
            id      => "Phenopacket_" . $id,
            subject => {
                id  => "IndividualId_" . $id,
                age => {
                    iso8601duration =>
                      fake_template( "P%dY", fake_int_mod( 1, 99 ) )
                },
                sex => fake_pick_mod( [ 'MALE', 'FEMALE' ] )
            },
            diseases           => $result_hash->{diseases},
            phenotypicFeatures => $result_hash->{phenotypicFeatures},
            medicalActions     => merge_medical_actions($result_hash)
        }
    );
    return $pxf->();
}

sub merge_medical_actions {
    my $hash = shift;

    # Initialize empty arrays for treatments and procedures
    my @processed_treatments;
    my @processed_procedures;

    # Process treatments if defined
    if ( defined $hash->{treatments} ) {
        @processed_treatments =
          map { { treatment => $_ } } @{ $hash->{treatments} };
    }

    # Process procedures if defined
    if ( defined $hash->{procedures} ) {
        @processed_procedures =
          map { { procedure => $_ } } @{ $hash->{procedures} };
    }

    # Merge the processed arrays and return a reference
    # NB: If undef no elements will be added
    return [ @processed_treatments, @processed_procedures ];
}

sub bff_generator {
    my ( $id, $self ) = @_;
    my $default_array = [];
    my $result_hash   = run_functions($self);
    my $bff           = fake_hash(
        {
            id        => "Beacon_" . $id,
            ethnicity => $result_hash->{ethnicity},
            sex       => fake_pick_mod(
                [
                    { id => "NCIT:C20197", label => "Male" },
                    { id => "NCIT:C16576", label => "Female" }
                ]
            ),
            diseases           => $result_hash->{diseases} // $default_array,
            phenotypicFeatures => $result_hash->{phenotypicFeatures}
              // $default_array,
            treatments => $result_hash->{treatments} // $default_array,
            interventionsOrProcedures => $result_hash->{procedures}
              // $default_array,
            exposures => $result_hash->{exposures} // $default_array
        }
    );
    return $bff->();
}

######################
#  START ARRAY TERMS #
######################

sub create_entries {
    my ( $params, $ontologies_array, $n, $max ) = @_;
    my $shuffled_slice = shuffle_slice( $max, $ontologies_array );
    my $array;
    for ( my $i = 0 ; $i < $n ; $i++ ) {
        push @$array,
          $params->{entry_creator}->( $shuffled_slice->[$i], $params );
    }
    return $array;
}

sub common_entry_creator {
    my ( $element, $params ) = @_;
    return {
        $params->{type}  => $element,
        $params->{onset} => {
            age => {
                iso8601duration =>
                  fake_template( "P%dY", fake_int_mod( 1, 99 ) )
            }
        }
    };
}

sub phenotypicFeatures {
    my ( $format, $ontologies_array, $n, $max ) = @_;
    my $params = {
        type          => $format eq 'bff' ? 'featureType' : 'type',
        onset         => $format eq 'bff' ? 'ageOfOnset'  : 'onset',
        entry_creator => \&common_entry_creator
    };
    return create_entries( $params, $ontologies_array, $n, $max );
}

sub diseases {
    my ( $format, $ontologies_array, $n, $max ) = @_;
    my $params = {
        type          => $format eq 'bff' ? 'diseaseCode' : 'term',
        onset         => $format eq 'bff' ? 'ageOfOnset'  : 'onset',
        entry_creator => \&common_entry_creator
    };
    return create_entries( $params, $ontologies_array, $n, $max );
}

sub treatments {
    my ( $format, $ontologies_array, $n, $max ) = @_;
    my $params = {
        entry_creator => sub {
            my ( $element, $p ) = @_;
            return $format eq 'bff'
              ? { treatmentCode => $element }
              : { agent         => $element };
        }
    };
    return create_entries( $params, $ontologies_array, $n, $max );
}

sub procedures {
    my ( $format, $ontologies_array, $n, $max ) = @_;
    my $params = {
        type          => $format eq 'bff' ? 'procedureCode'  : 'term',
        onset         => $format eq 'bff' ? 'ageAtProcedure' : 'onset',
        entry_creator => \&common_entry_creator
    };
    return create_entries( $params, $ontologies_array, $n, $max );
}

sub exposures {
    my ( $format, $ontologies_array, $n, $max ) = @_;
    my $default_duration = 'P999Y';
    my $default_ontology_term =
      { id => 'NCIT:C126101', label => 'Not Available' };
    my $params = {
        type          => $format eq 'bff' ? 'exposureCode'  : 'term',
        onset         => $format eq 'bff' ? 'ageAtExposure' : 'onset',
        entry_creator => \&common_entry_creator
    };
    my $entries = create_entries( $params, $ontologies_array, $n, $max );

    # Add 'duration' 'and 'unit' to pass bff-validator
    foreach my $entry (@$entries) {
        $entry->{duration} = $default_duration;
        $entry->{unit}     = $default_ontology_term;
    }
    return $entries;
}

sub ethnicity {
    my ( undef, $ontologies_array, undef, $max ) = @_;
    my $shuffled_slice = shuffle_slice( $max, $ontologies_array );
    return $shuffled_slice->[0];
}

####################
#  END ARRAY TERMS #
####################

sub load_ontology_hash {
    my $self = shift;
    my %ont  = (
        diseases           => $omim_array,
        ethnicity          => $ethnicity_array,
        exposures          => $ncit_exposures_array,
        phenotypicFeatures => $hpo_array,
        procedures         => $ncit_procedures_array,
        treatments         => $rxnorm_array
    );
    return \%ont;
}

sub run_functions {
    my $self       = shift;
    my $ontologies = load_ontology_hash($self);

    my %func = (
        diseases           => \&diseases,
        ethnicity          => \&ethnicity,
        exposures          => \&exposures,
        phenotypicFeatures => \&phenotypicFeatures,
        procedures         => \&procedures,
        treatments         => \&treatments
    );

    my %hash;

    # *** IMPORTANT ***
    # sort keys (below) is mandatory for reproducibility
    for my $key ( sort keys %func ) {
        my $ontologies_array =
          exists $self->{ontologies_data}{$key}
          ? $self->{ontologies_data}{$key}
          : $ontologies->{$key};
        $hash{$key} = $func{$key}->(
            $self->{format}, $ontologies_array, $self->{$key},
            $self->{ 'max_' . $key . '_pool' }
        );
    }

    return \%hash;
}

sub shuffle_slice {
    my ( $max, $array ) = @_;

    # head   -> 1.50 List::Util (5.26 has 1.4602)
    #my @items = sample $count, @values; # 1.54 List::Util
    # *** IMPORTANT ***
    # If $max was defined by the user then use it, otherwise @$array;
    my @slice          = defined $max ? head $max, @$array : @$array;    # slice of refs
    my @shuffled_slice = shuffle @slice;
    return wantarray ? @shuffled_slice : \@shuffled_slice;
}

sub fake_int_mod {

    # This subroutine was built because fake_int did not respond to srand
    my ( $low, $high ) = @_;
    my $range = $high - $low;
    return int( rand($range) ) + 1;
}

sub fake_pick_mod {

    # This subroutine was built because fake_pick did not respond to srand
    # NB: The original from Data::Fake worked with array (not with arrayref)
    my $array = shift;
    return $array->[ int( rand(@$array) ) ];
}

sub validate_json {
    my $file   = shift;
    my $data   = read_yaml($file);
    my $schema = {
        '$schema'  => 'http://json-schema.org/draft-07/schema#',
        type       => "object",
        properties => {
            diseases           => { '$ref' => '#/$defs/array' },
            phenotypicFeatures => { '$ref' => '#/$defs/array' },
            treatments         => { '$ref' => '#/$defs/array' },
            procedures         => { '$ref' => '#/$defs/array' },
            exposures          => { '$ref' => '#/$defs/array' },
            ethnicity          => { '$ref' => '#/$defs/array' }
        },
        '$defs' => {
            array => {
                type  => "array",
                items => { '$ref' => '#/$defs/item' }
            },
            item => {
                type       => "object",
                required   => [ "id", "label" ],
                properties => {
                    id => { type => "string", pattern => qq/^\\w[^:]+:.+\$/ },
                    label => { type => "string" }
                }
            }
        }
    };

    # Load at runtime
    require JSON::Validator;

    # Create object and load schema
    my $jv = JSON::Validator->new;

    # Load schema in object
    $jv->schema($schema);

    # Validate data
    my @errors = $jv->validate($data);

    # Show error if any
    say_errors( \@errors ) and die if @errors;

    # return data if ok
    return $data;

}

sub say_errors {
    my $errors = shift;
    if ( @{$errors} ) {
        say join "\n", @{$errors};
    }
    return 1;
}

sub read_yaml {

    # Load at runtime
    require YAML::XS;
    YAML::XS->import('LoadFile');
    return LoadFile(shift);    # Decode to Perl data structure
}

1;

=head1 NAME

bff-pxf-simulator: A script that creates a JSON array of simulated BFF/PXF

=head1 SYNOPSIS

 bff-pxf-simulator [-options]

   Options:
     -f, --format <format>            Format [bff|pxf]
     -n, --number <number>            Set the number of individuals to generate [100]
     -o, --output <file>              Output file [individuals.json]
     --external-ontologies <file>     Path to a YAML file containing ontology terms
     --random-seed <seed>             Initializes pseudorandom number sequences (seed must be an integer)

     --diseases <number>              Set the number of diseases per individual [1]
     --exposures <number>             Set the number of exposures per individual [1]
     --phenotypicFeatures <number>    Set the number of phenotypic features per individual [1]
     --procedures <number>            Set the number of procedures per individual [1]
     --treatments <number>            Set the number of treatments per individual [1]
     --max-[term]-pool <size>         Limit the selection to the first N elements of the term array
     --max-ethnicity-pool <size>      Restrict the ethnicity pool size; each individual will have only one ethnicity

   Generic Options:
     -debug <level>                   Print debugging (from 1 to 5, being 5 max)
     -h, --help                       Brief help message
     -man                             Full documentation
     -v, --verbose                    Verbosity on
     -V, --version                    Print version

=head1 DESCRIPTION

This script generates a JSON array of simulated BFF/PXF data. The files can be created based on pre-loaded ontologies or by utilizing an external YAML file.

=head1 SUMMARY

A script that creates a JSON array of simulated BFF/PXF. 

Implemented array terms:

B<BFF:> C<diseases, exposures, interventionsOrProcedures, phenotypicFeatures, treatments>. 

procedures = interventionsOrProcedures

B<PXF:> C<interventionsOrProcedures, medicalActions.procedure, medicalActions.treatment, phenotypicFeatures>.

procedures = medicalActions.procedure

treatments = medicalActions.treatment

=head1 INSTALLATION

(only needed if you did not install C<Pheno-Ranker>)

 $ cpanm --installdeps .

=head3 System requirements

  * Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOs, OpenSuse) should do as well.
  * Perl 5 (>= 5.10 core; installed by default in most Linux distributions). Check the version with "perl -v"
  * 1GB of RAM.
  * 1 core (it only uses one core per job).
  * At least 1GB HDD.

=head1 HOW TO RUN BFF-PXF-SIMULATOR

When run without any arguments, the software will use default settings. To modify any parameters, please refer to the synopsis for guidance.

If you prefer not to include a specific term in the analysis, set its value to zero. For example:

C<--treatments 0>

B<Examples:>

 $ ./bff-pxf-simulator -f pxf  # BFF with 100 samples

 $ ./bff-pxf-simulator -f pxf -n 1000 -o pxf.json # PXF with 1K samples and saved to pxf.json

 $ ./bff-pxf-simulator -phenotypicFeatures 10 # BFF with 100 samples and 10 pF each

 $ ./bff-pxf-simulator -diseases 0 -exposures 0 -procedures 0 -phenotypicFeatures 0 -treatments 0 # Only sex and ethnicity

=head2 COMMON ERRORS AND SOLUTIONS

 * Error message: Foo
   Solution: Bar

 * Error message: Foo
   Solution: Bar

=head1 AUTHOR 

Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.

=head1 COPYRIGHT AND LICENSE

This PERL file is copyrighted. See the LICENSE file included in this distribution.

=cut
